From d0cef713cf19fa75011bd7ab53c20c168cfd3d93 Mon Sep 17 00:00:00 2001 From: Sadam Hussain Memon Date: Thu, 10 Jun 2021 05:51:30 +0500 Subject: [PATCH 001/258] `mixed_precision` set to false Change default value of `"mixed_precision" : false` as when it is set true it leads to `raise RuntimeError(f" [!] NaN loss with {key}.") RuntimeError: [!] NaN loss with decoder_loss.` --- recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..dd3b71db 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -41,7 +41,7 @@ "run_description": "tacotron2 with double decoder consistency.", "batch_size": 64, "eval_batch_size": 16, - "mixed_precision": true, + "mixed_precision": false, "loss_masking": true, "decoder_loss_alpha": 0.25, "postnet_loss_alpha": 0.25, @@ -88,4 +88,4 @@ "phoneme_cache_path": "DEFINE THIS", "use_phonemes": false, "phoneme_language": "en-us" -} \ No newline at end of file +} From 0f284841d1bf2a4d17486675773a327920cec783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 20 May 2021 18:22:52 +0200 Subject: [PATCH 002/258] rename MyDataset -> TTSDataset --- TTS/bin/compute_attention_masks.py | 4 ++-- TTS/bin/extract_tts_spectrograms.py | 4 ++-- TTS/bin/train_align_tts.py | 4 ++-- TTS/bin/train_glow_tts.py | 4 ++-- TTS/bin/train_speedy_speech.py | 4 ++-- TTS/bin/train_tacotron.py | 4 ++-- TTS/tts/datasets/TTSDataset.py | 10 +++++----- notebooks/ExtractTTSpectrogram.ipynb | 4 ++-- tests/data_tests/test_loader.py | 2 +- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 0a4337da..e14ff433 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -8,7 +8,7 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import load_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols @@ -83,7 +83,7 @@ Example run: preprocessor = importlib.import_module("TTS.tts.datasets.preprocess") preprocessor = getattr(preprocessor, args.dataset) meta_data = preprocessor(args.data_path, args.dataset_metafile) - dataset = MyDataset( + dataset = TTSDataset( model.decoder.r, C.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index ace7464a..e8814a11 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -11,7 +11,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.speakers import parse_speakers from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols @@ -22,7 +22,7 @@ use_cuda = torch.cuda.is_available() def setup_loader(ap, r, verbose=False): - dataset = MyDataset( + dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index 7e3921b0..f5658dd2 100644 --- 
a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -14,7 +14,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -38,7 +38,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index e93a4e8a..50e95a2b 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -38,7 +38,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 2fba3df1..4ab0c899 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -16,7 +16,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -39,7 +39,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 9685d0d7..098a8d3f 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -12,7 +12,7 @@ import torch from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -43,7 +43,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): loader = None else: if dataset is None: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=config.model.lower() == "tacotron", diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 4ca93232..cbb0a593 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -12,7 +12,7 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.text import pad_with_eos_bos, phoneme_to_sequence, 
text_to_sequence -class MyDataset(Dataset): +class TTSDataset(Dataset): def __init__( self, outputs_per_step, @@ -117,12 +117,12 @@ class MyDataset(Dataset): try: phonemes = np.load(cache_path) except FileNotFoundError: - phonemes = MyDataset._generate_and_cache_phoneme_sequence( + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( text, cache_path, cleaners, language, tp, add_blank ) except (ValueError, IOError): print(" [!] failed loading phonemes for {}. " "Recomputing.".format(wav_file)) - phonemes = MyDataset._generate_and_cache_phoneme_sequence( + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( text, cache_path, cleaners, language, tp, add_blank ) if enable_eos_bos: @@ -190,7 +190,7 @@ class MyDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item - phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) + phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes def compute_input_seq(self, num_workers=0): @@ -225,7 +225,7 @@ class MyDataset(Dataset): with Pool(num_workers) as p: phonemes = list( tqdm.tqdm( - p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), + p.imap(TTSDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items), ) ) diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index dc35e86f..bdc7c955 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -22,7 +22,7 @@ "import numpy as np\n", "from tqdm import tqdm as tqdm\n", "from torch.utils.data import DataLoader\n", - "from TTS.tts.datasets.TTSDataset import MyDataset\n", + "from TTS.tts.datasets.TTSDataset import TTSDataset\n", "from TTS.tts.layers.losses import L1LossMasked\n", "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", @@ -112,7 +112,7 @@ "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index e2dba37a..053da516 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - dataset = TTSDataset.MyDataset( + dataset = TTSDataset.TTSDataset( r, c.text_cleaner, compute_linear_spec=True, From 7bdd0eb72f3a51f4b29603dd42bd80efaf7dadc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 20 May 2021 18:23:53 +0200 Subject: [PATCH 003/258] trainer-API updates --- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- 
TTS/bin/train_vocoder_gan.py | 2 +- TTS/bin/train_vocoder_wavegrad.py | 2 +- TTS/bin/train_vocoder_wavernn.py | 2 +- TTS/tts/configs/shared_configs.py | 20 ++++++++++++++++ TTS/tts/configs/tacotron_config.py | 21 +++++++++++------ TTS/tts/utils/speakers.py | 29 +++++++++++++----------- TTS/tts/utils/text/cleaners.py | 6 ++--- TTS/utils/tensorboard_logger.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 13 files changed, 62 insertions(+), 31 deletions(-) diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index f5658dd2..d231484a 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -229,7 +229,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 50e95a2b..9a455a1b 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -270,7 +270,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 4ab0c899..742a27d8 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -256,7 +256,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 098a8d3f..b5e38b80 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -327,7 +327,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, ap, "step_time": step_time, } iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 123d5a43..ea317ef6 100755 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -265,7 +265,7 @@ def train( if global_step % 10 == 0: iter_stats = {"lr_G": current_lr_G, "lr_D": current_lr_D, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index c0fcff51..c8f067ee 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -181,7 +181,7 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, 
global_step, epoch if global_step % 10 == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm.item(), "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index bcad9493..86a1506a 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -163,7 +163,7 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch if global_step % 10 == 0: iter_stats = {"lr": cur_lr, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4690e76f..15adff45 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -131,6 +131,18 @@ class BaseTTSConfig(BaseTrainingConfig): datasets (List[BaseDatasetConfig]): List of datasets used for training. If multiple datasets are provided, they are merged and used together for training. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to ``. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to ``. + lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. + test_sentences (List[str]): + List of sentences to be used at testing. Defaults to '[]' """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -155,3 +167,11 @@ class BaseTTSConfig(BaseTrainingConfig): add_blank: bool = False # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # optimizer + optimizer: str = MISSING + optimizer_params: dict = MISSING + # scheduler + lr_scheduler: str = '' + lr_scheduler_params: dict = field(default_factory=lambda: {}) + # testing + test_sentences: List[str] = field(default_factory=lambda:[]) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index a567cd88..ff8d89bb 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -78,10 +78,16 @@ class TacotronConfig(BaseTTSConfig): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. external_speaker_embedding_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to `RAdam`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to `NoamLR`. 
+ lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. lr (float): Initial learning rate. Defaults to `1e-4`. wd (float): @@ -152,10 +158,11 @@ class TacotronConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 seq_len_norm: bool = False loss_masking: bool = True diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 84da1f72..4ab78f88 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Union +from typing import Union, List, Any import numpy as np import torch @@ -35,9 +35,7 @@ def save_speaker_mapping(out_path, speaker_mapping): def get_speakers(items): - """Returns a sorted, unique list of speakers in a given dataset.""" - speakers = {e[2] for e in items} - return sorted(speakers) + def parse_speakers(c, args, meta_data_train, OUT_PATH): @@ -121,26 +119,31 @@ class SpeakerManager: Args: x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". - speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by the - TTS model. Defaults to "". + speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by + TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "". 
""" def __init__( self, + data_items: List[List[Any]] = None, x_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", ): - self.x_vectors = None - self.speaker_ids = None - self.clip_ids = None + self.data_items = [] + self.x_vectors = [] + self.speaker_ids = [] + self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None + if data_items: + self.speaker_ids = self.parse_speakers() + if x_vectors_file_path: self.load_x_vectors_file(x_vectors_file_path) @@ -169,10 +172,10 @@ class SpeakerManager: return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) def parser_speakers_from_items(self, items: list): - speaker_ids = sorted({item[2] for item in items}) - self.speaker_ids = speaker_ids - num_speakers = len(speaker_ids) - return speaker_ids, num_speakers + speakers = sorted({item[2] for item in items}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} + num_speakers = len(self.speaker_ids) + return self.speaker_ids, num_speakers def save_ids_file(self, file_path: str): self._save_json(file_path, self.speaker_ids) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 3d2caa97..4b041ed8 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -65,7 +65,7 @@ def basic_cleaners(text): def transliteration_cleaners(text): """Pipeline for non-English text that transliterates to ASCII.""" - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = lowercase(text) text = collapse_whitespace(text) return text @@ -89,7 +89,7 @@ def basic_turkish_cleaners(text): def english_cleaners(text): """Pipeline for English text, including number and abbreviation expansion.""" - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = lowercase(text) text = expand_time_english(text) text = expand_numbers(text) @@ -129,7 +129,7 @@ def chinese_mandarin_cleaners(text: str) -> str: def phoneme_cleaners(text): """Pipeline for phonemes mode, including number and abbreviation expansion.""" text = expand_numbers(text) - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = expand_abbreviations(text) text = replace_symbols(text) text = remove_aux_symbols(text) diff --git a/TTS/utils/tensorboard_logger.py b/TTS/utils/tensorboard_logger.py index 3874a42b..657deb5b 100644 --- a/TTS/utils/tensorboard_logger.py +++ b/TTS/utils/tensorboard_logger.py @@ -39,7 +39,7 @@ class TensorboardLogger(object): except RuntimeError: traceback.print_exc() - def tb_train_iter_stats(self, step, stats): + def tb_train_step_stats(self, step, stats): self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) def tb_train_epoch_stats(self, step, stats): diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 3ff65b5a..e3004db7 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -21,6 +21,7 @@ config = MelganConfig( print_step=1, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From b9d4355d2007b1fda1aa9c8b85c037638743cce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:36:26 +0200 Subject: [PATCH 004/258] set test_sentences in config --- 
TTS/tts/configs/tacotron_config.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index ff8d89bb..2fc7cc78 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig @@ -176,6 +176,15 @@ class TacotronConfig(BaseTTSConfig): postnet_ssim_alpha: float = 0.25 ga_alpha: float = 5.0 + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." + ]) + def check_values(self): if self.gradual_training: assert ( From facb782851d431fe3eee6e1b9c8f83363fae4ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:37:01 +0200 Subject: [PATCH 005/258] move load_meta_data and related functions to `datasets/__init__.py` --- TTS/tts/datasets/__init__.py | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index e69de29b..b238209f 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -0,0 +1,88 @@ +import sys +import numpy as np +from collections import Counter +from pathlib import Path +from TTS.tts.datasets.TTSDataset import TTSDataset +from TTS.tts.datasets.formatters import * + +#################### +# UTILITIES +#################### + + +def split_dataset(items): + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = min(500, int(len(items) * 0.01)) + assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." 
+ np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + while len(items_eval) < eval_split_size: + item_idx = np.random.randint(0, len(items)) + speaker_to_be_removed = items[item_idx][-1] + if speaker_counter[speaker_to_be_removed] > 1: + items_eval.append(items[item_idx]) + speaker_counter[speaker_to_be_removed] -= 1 + del items[item_idx] + return items_eval, items + return items[:eval_split_size], items[eval_split_size:] + + +def load_meta_data(datasets, eval_split=True): + meta_data_train_all = [] + meta_data_eval_all = [] if eval_split else None + for dataset in datasets: + name = dataset["name"] + root_path = dataset["path"] + meta_file_train = dataset["meta_file_train"] + meta_file_val = dataset["meta_file_val"] + # setup the right data processor + preprocessor = _get_preprocessor_by_name(name) + # load train set + meta_data_train = preprocessor(root_path, meta_file_train) + print( + f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}" + ) + # load evaluation split if set + if eval_split: + if meta_file_val: + meta_data_eval = preprocessor(root_path, meta_file_val) + else: + meta_data_eval, meta_data_train = split_dataset( + meta_data_train) + meta_data_eval_all += meta_data_eval + meta_data_train_all += meta_data_train + # load attention masks for duration predictor training + if dataset.meta_file_attn_mask: + meta_data = dict( + load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + for idx, ins in enumerate(meta_data_train_all): + attn_file = meta_data[ins[1]].strip() + meta_data_train_all[idx].append(attn_file) + if meta_data_eval_all: + for idx, ins in enumerate(meta_data_eval_all): + attn_file = meta_data[ins[1]].strip() + meta_data_eval_all[idx].append(attn_file) + return meta_data_train_all, meta_data_eval_all + + +def load_attention_mask_meta_data(metafile_path): + """Load meta data file created by compute_attention_masks.py""" + with open(metafile_path, "r") as f: + lines = f.readlines() + + meta_data = [] + for line in lines: + wav_file, attn_file = line.split("|") + meta_data.append([wav_file, attn_file]) + return meta_data + + +def _get_preprocessor_by_name(name): + """Returns the respective preprocessing function.""" + thismodule = sys.modules[__name__] + return getattr(thismodule, name.lower()) From f07209d2e08828fdc64f4cb3f20a455a53de288e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:38:05 +0200 Subject: [PATCH 006/258] rename preprocess.py -> formatters.py --- .../datasets/{preprocess.py => formatters.py} | 81 ------------------- TTS/utils/arguments.py | 39 ++++++--- 2 files changed, 26 insertions(+), 94 deletions(-) rename TTS/tts/datasets/{preprocess.py => formatters.py} (80%) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/formatters.py similarity index 80% rename from TTS/tts/datasets/preprocess.py rename to TTS/tts/datasets/formatters.py index 62cb9fef..f43733b1 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/formatters.py @@ -1,93 +1,12 @@ import os import re -import sys import xml.etree.ElementTree as ET -from collections import Counter from glob import glob from pathlib import Path from typing import List -import numpy as np from tqdm import tqdm -#################### -# UTILITIES -#################### - - -def split_dataset(items): - speakers = [item[-1] for item in items] - is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 
min(500, int(len(items) * 0.01)) - assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." - np.random.seed(0) - np.random.shuffle(items) - if is_multi_speaker: - items_eval = [] - speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) - while len(items_eval) < eval_split_size: - item_idx = np.random.randint(0, len(items)) - speaker_to_be_removed = items[item_idx][-1] - if speaker_counter[speaker_to_be_removed] > 1: - items_eval.append(items[item_idx]) - speaker_counter[speaker_to_be_removed] -= 1 - del items[item_idx] - return items_eval, items - return items[:eval_split_size], items[eval_split_size:] - - -def load_meta_data(datasets, eval_split=True): - meta_data_train_all = [] - meta_data_eval_all = [] if eval_split else None - for dataset in datasets: - name = dataset["name"] - root_path = dataset["path"] - meta_file_train = dataset["meta_file_train"] - meta_file_val = dataset["meta_file_val"] - # setup the right data processor - preprocessor = get_preprocessor_by_name(name) - # load train set - meta_data_train = preprocessor(root_path, meta_file_train) - print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") - # load evaluation split if set - if eval_split: - if meta_file_val: - meta_data_eval = preprocessor(root_path, meta_file_val) - else: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - meta_data_eval_all += meta_data_eval - meta_data_train_all += meta_data_train - # load attention masks for duration predictor training - if dataset.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) - for idx, ins in enumerate(meta_data_train_all): - attn_file = meta_data[ins[1]].strip() - meta_data_train_all[idx].append(attn_file) - if meta_data_eval_all: - for idx, ins in enumerate(meta_data_eval_all): - attn_file = meta_data[ins[1]].strip() - meta_data_eval_all[idx].append(attn_file) - return meta_data_train_all, meta_data_eval_all - - -def load_attention_mask_meta_data(metafile_path): - """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r") as f: - lines = f.readlines() - - meta_data = [] - for line in lines: - wav_file, attn_file = line.split("|") - meta_data.append([wav_file, attn_file]) - return meta_data - - -def get_preprocessor_by_name(name): - """Returns the respective preprocessing function.""" - thismodule = sys.modules[__name__] - return getattr(thismodule, name.lower()) - ######################## # DATASETS diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 5e6acd1d..3fc63e26 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -30,16 +30,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), + help=("Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored."), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) + "--restore_path", + type=str, + help="Model file to be restored. 
Use to finetune a model.", + default="") parser.add_argument( "--best_path", type=str, @@ -49,12 +49,23 @@ def init_arguments(argv): ), default="", ) + parser.add_argument("--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in argv) + parser.add_argument("--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.") parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", + type=str, + default="", + help="DISTRIBUTED: process group id.") return parser @@ -149,7 +160,8 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, + config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -170,7 +182,8 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", + 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From c61486b1e39dc2cb792f3dd3bed6ff638fe523aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:38:44 +0200 Subject: [PATCH 007/258] `setup_loss()` in `layer/__init__.py` --- TTS/tts/layers/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/TTS/tts/layers/__init__.py b/TTS/tts/layers/__init__.py index e69de29b..78f56a5d 100644 --- a/TTS/tts/layers/__init__.py +++ b/TTS/tts/layers/__init__.py @@ -0,0 +1,15 @@ +from TTS.tts.layers.losses import * + + +def setup_loss(config): + if config.model.lower() in ["tacotron", "tacotron2"]: + model = TacotronLoss(config) + elif config.model.lower() == "glow_tts": + model = GlowTTSLoss() + elif config.model.lower() == "speedy_speech": + model = SpeedySpeechLoss(config) + elif config.model.lower() == "align_tts": + model = AlignTTSLoss(config) + else: + raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.") + return model From 86edf6ab0e768e530698364d32eef46fdd24d6e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:38:31 +0200 Subject: [PATCH 008/258] add sequence_mask to `utils.data` --- TTS/tts/layers/losses.py | 2 +- TTS/tts/utils/data.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 729a21af..27c6e9e5 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -5,7 +5,7 @@ import torch from torch import nn from torch.nn import functional -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.ssim import ssim diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 259a32d9..5f8624e6 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,3 +1,4 @@ +import torch import numpy as np @@ -65,3 +66,12 @@ class StandardScaler: X *= self.scale_ X += self.mean_ return X + + +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) + # B x T_max + return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) From c98149d488220ea123443acaddc1d16189667b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:40:23 +0200 Subject: [PATCH 009/258] mode `setup_model()` to `models/__init__.py` --- TTS/tts/models/__init__.py | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index e69de29b..153f8d43 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -0,0 +1,108 @@ +from TTS.utils.generic_utils import find_module + + +def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): + print(" > Using model: {}".format(c.model)) + MyModel = find_module("TTS.tts.models", c.model.lower()) + if c.model.lower() in "tacotron": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), + decoder_output_dim=c.audio["num_mels"], + use_gst=c.use_gst, + gst=c.gst, + memory_size=c.memory_size, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + 
prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + prenet_dropout_at_inference=c.prenet_dropout_at_inference, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "tacotron2": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=c.audio["num_mels"], + decoder_output_dim=c.audio["num_mels"], + use_gst=c.use_gst, + gst=c.gst, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + prenet_dropout_at_inference=c.prenet_dropout_at_inference, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "glow_tts": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + hidden_channels_enc=c["hidden_channels_encoder"], + hidden_channels_dec=c["hidden_channels_decoder"], + hidden_channels_dp=c["hidden_channels_duration_predictor"], + out_channels=c.audio["num_mels"], + encoder_type=c.encoder_type, + encoder_params=c.encoder_params, + use_encoder_prenet=c["use_encoder_prenet"], + inference_noise_scale=c.inference_noise_scale, + num_flow_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=1, + num_block_layers=4, + dropout_p_dec=0.05, + num_speakers=num_speakers, + c_in_channels=0, + num_splits=4, + num_squeeze=2, + sigmoid_scale=False, + mean_only=True, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "speedy_speech": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + out_channels=c.audio["num_mels"], + hidden_channels=c["hidden_channels"], + positional_encoding=c["positional_encoding"], + encoder_type=c["encoder_type"], + encoder_params=c["encoder_params"], + decoder_type=c["decoder_type"], + decoder_params=c["decoder_params"], + c_in_channels=0, + ) + elif c.model.lower() == "align_tts": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + out_channels=c.audio["num_mels"], + hidden_channels=c["hidden_channels"], + hidden_channels_dp=c["hidden_channels_dp"], + encoder_type=c["encoder_type"], + encoder_params=c["encoder_params"], + decoder_type=c["decoder_type"], + decoder_params=c["decoder_params"], + c_in_channels=0, + ) + return model From 118a7f2b43f3178d25107027e1341e2651811251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:41:22 +0200 Subject: [PATCH 010/258] import missings for tacotron.py --- TTS/tts/models/tacotron.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 89d98e9f..4413b015 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -2,6 +2,8 @@ import torch from torch import nn +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import 
plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.models.tacotron_abstract import TacotronAbstract From d6b6a15b5cb53e257b19d959a8e527ffc371e299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:42:16 +0200 Subject: [PATCH 011/258] add `gradual_training` argument to tacotron.py --- TTS/tts/models/tacotron.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 4413b015..c1d95a25 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -47,8 +47,9 @@ class Tacotron(TacotronAbstract): gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` output frames to the prenet. + gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + Defaults to `[]`. """ - def __init__( self, num_chars, @@ -77,6 +78,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, + gradual_training=[] ): super().__init__( num_chars, @@ -104,6 +106,7 @@ class Tacotron(TacotronAbstract): speaker_embedding_dim, use_gst, gst, + gradual_training ) # speaker embedding layers From 2ab723cd1048f4e126f2947d034ad20b159f724f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:36:06 +0200 Subject: [PATCH 012/258] update Tacotron models for the trainer --- TTS/tts/configs/tacotron_config.py | 1 + TTS/tts/models/tacotron.py | 198 ++++++++++++++---- TTS/tts/models/tacotron2.py | 308 +++++++++++++++++----------- TTS/tts/models/tacotron_abstract.py | 26 ++- 4 files changed, 373 insertions(+), 160 deletions(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 2fc7cc78..90decaa3 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -126,6 +126,7 @@ class TacotronConfig(BaseTTSConfig): use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None + # model specific params r: int = 2 gradual_training: List[List[int]] = None diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index c1d95a25..23bd839f 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -113,7 +113,8 @@ class Tacotron(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, + speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -144,7 +145,8 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, + postnet_output_dim) # setup prenet dropout self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference @@ -181,93 +183,203 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) - def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, + text, + text_lengths, + mel_specs=None, + mel_lengths=None, 
+ cond_input=None): """ Shapes: - characters: [B, T_in] + text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + outputs = { + 'alignments_backward': None, + 'decoder_outputs_backward': None + } input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim - inputs = self.embedding(characters) + inputs = self.embedding(text) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( + encoder_outputs) # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, + cond_input['x_vectors']) # speaker embedding if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, + None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in - decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) # sequence masking if output_mask is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze( + 1).expand_as(decoder_outputs) # B x T_out x decoder_in_features postnet_outputs = self.postnet(decoder_outputs) # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze( + 2).expand_as(postnet_outputs) # B x T_out x posnet_dim postnet_outputs = self.last_linear(postnet_outputs) # B x T_out x decoder_in_features decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + decoder_outputs_backward, alignments_backward = self._backward_pass( + mel_specs, encoder_outputs, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask - ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + mel_specs, encoder_outputs, 
alignments, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward + outputs.update({ + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + }) + return outputs @torch.no_grad() - def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): - inputs = self.embedding(characters) + def inference(self, + text_input, + cond_input=None): + inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + outputs = { + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + } + return outputs + + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] + """ + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + linear_input = batch['linear_input'] + stop_targets = batch['stop_targets'] + speaker_ids = batch['speaker_ids'] + x_vectors = batch['x_vectors'] + + # forward pass model + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={ + 'speaker_ids': speaker_ids, + 'x_vectors': x_vectors + }) + + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + + (self.decoder.r - + (mel_lengths.max() % self.decoder.r))) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r + + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, + mel_lengths, cond_input) + + # compute loss + loss_dict = criterion( + outputs['postnet_outputs'], + outputs['decoder_outputs'], + mel_input, + linear_input, + outputs['stop_tokens'], + stop_targets, + mel_lengths, + outputs['decoder_outputs_backward'], + outputs['alignments'], + alignment_lengths, + outputs['alignments_backward'], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments']) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap, batch, outputs): + postnet_outputs = outputs['postnet_outputs'] + alignments = outputs['alignments'] + alignments_backward = outputs['alignments_backward'] + mel_input = batch['mel_input'] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment( + alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_spectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return self.train_log(ap, batch, outputs) \ No newline at end of file diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 525eb8b3..51b181e4 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,12 +1,15 @@ +# coding: utf-8 +import numpy as np import torch from torch import nn +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.models.tacotron_abstract import TacotronAbstract -# TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): """Tacotron2 as in https://arxiv.org/abs/1712.05884 @@ -43,69 +46,52 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. 
gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. + gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + Defaults to `[]`. """ - - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - speaker_embedding_dim, - use_gst, - gst, - ) + def __init__(self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type="original", + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + prenet_dropout_at_inference=False, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + use_gst=False, + gst=None, + gradual_training=[]): + super().__init__(num_chars, num_speakers, r, postnet_output_dim, + decoder_output_dim, attn_type, attn_win, attn_norm, + prenet_type, prenet_dropout, + prenet_dropout_at_inference, forward_attn, + trans_agent, forward_attn_mask, location_attn, attn_K, + separate_stopnet, bidirectional_decoder, + double_decoder_consistency, ddc_r, + encoder_in_features, decoder_in_features, + speaker_embedding_dim, use_gst, gst, gradual_training) # speaker embedding layer if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, + speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -176,16 +162,24 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, + text, + text_lengths, + mel_specs=None, + mel_lengths=None, + cond_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + outputs = { + 'alignments_backward': None, + 'decoder_outputs_backward': None + } # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -195,94 +189,176 @@ class 
Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, + None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( + encoder_outputs) # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r - decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) # sequence masking if mel_lengths is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze( + 1).expand_as(decoder_outputs) # B x mel_dim x T_out postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze( + 1).expand_as(postnet_outputs) # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in - decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + decoder_outputs_backward, alignments_backward = self._backward_pass( + mel_specs, encoder_outputs, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask - ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + mel_specs, encoder_outputs, alignments, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward + outputs.update({ + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + }) + return outputs @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def 
inference(self, text, cond_input=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] + x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) + else: + x_vector = cond_input - decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, x_vector) + + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs - decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) + outputs = { + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + } + return outputs - def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
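Both Tacotron train_step implementations round the alignment lengths up to a multiple of the decoder reduction factor before the guided-attention loss is computed. A minimal standalone sketch of that rounding, with illustrative values only (`mel_lengths` assumed to be a 1-D LongTensor, `r` the reduction factor):

    import torch

    mel_lengths = torch.tensor([100, 97, 45])
    r = 7
    if mel_lengths.max() % r != 0:
        alignment_lengths = (mel_lengths + (r - (mel_lengths.max() % r))) // r
    else:
        alignment_lengths = mel_lengths // r
    print(alignment_lengths)  # tensor([15, 14,  7]) -- lengths counted in decoder steps of r frames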
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] """ - Preserve model states for continuous inference - """ - embedded_inputs = self.embedding(text).transpose(1, 2) - encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + linear_input = batch['linear_input'] + stop_targets = batch['stop_targets'] + speaker_ids = batch['speaker_ids'] + x_vectors = batch['x_vectors'] - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + # forward pass model + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={ + 'speaker_ids': speaker_ids, + 'x_vectors': x_vectors + }) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + + (self.decoder.r - + (mel_lengths.max() % self.decoder.r))) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r - mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(mel_outputs, mel_outputs_postnet, alignments) - return mel_outputs, mel_outputs_postnet, alignments, stop_tokens + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, + mel_lengths, cond_input) + + # compute loss + loss_dict = criterion( + outputs['model_outputs'], + outputs['decoder_outputs'], + mel_input, + linear_input, + outputs['stop_tokens'], + stop_targets, + mel_lengths, + outputs['decoder_outputs_backward'], + outputs['alignments'], + alignment_lengths, + outputs['alignments_backward'], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments']) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap, batch, outputs): + postnet_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + alignments_backward = outputs['alignments_backward'] + mel_input = batch['mel_input'] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment( + alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return self.train_log(ap, batch, outputs) diff --git 
a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index e684ce7c..2bea06a9 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,10 +1,12 @@ import copy +import logging from abc import ABC, abstractmethod import torch from torch import nn -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask +from TTS.utils.training import gradual_training_scheduler class TacotronAbstract(ABC, nn.Module): @@ -35,6 +37,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, + gradual_training=[] ): """Abstract Tacotron class""" super().__init__() @@ -63,6 +66,7 @@ class TacotronAbstract(ABC, nn.Module): self.encoder_in_features = encoder_in_features self.decoder_in_features = decoder_in_features self.speaker_embedding_dim = speaker_embedding_dim + self.gradual_training = gradual_training # layers self.embedding = None @@ -216,3 +220,23 @@ class TacotronAbstract(ABC, nn.Module): speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) return outputs + + ############################# + # CALLBACKS + ############################# + + def on_epoch_start(self, trainer): + """Callback for setting values wrt gradual training schedule. + + Args: + trainer (TrainerTTS): TTS trainer object that is used to train this model. + """ + if self.gradual_training: + r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) + trainer.config.r = r + self.decoder.set_r(r) + if trainer.config.bidirectional_decoder: + trainer.model.decoder_backward.set_r(r) + trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) + trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) + logging.info(f"\n > Number of output frames: {self.decoder.r}") From c9e552707073658c3b9eba1b421121192c7627bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:38:54 +0200 Subject: [PATCH 013/258] remove `tts.generic_utils` as all the functions are moved to other files --- TTS/tts/utils/generic_utils.py | 278 --------------------------------- 1 file changed, 278 deletions(-) delete mode 100644 TTS/tts/utils/generic_utils.py diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py deleted file mode 100644 index b0e53f33..00000000 --- a/TTS/tts/utils/generic_utils.py +++ /dev/null @@ -1,278 +0,0 @@ -import torch - -from TTS.utils.generic_utils import find_module - - -# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.data.max() - seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) - # B x T_max - return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) - - -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): - print(" > Using model: {}".format(c.model)) - MyModel = find_module("TTS.tts.models", c.model.lower()) - if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - 
attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "glow_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - hidden_channels_enc=c["hidden_channels_encoder"], - hidden_channels_dec=c["hidden_channels_decoder"], - hidden_channels_dp=c["hidden_channels_duration_predictor"], - out_channels=c.audio["num_mels"], - encoder_type=c.encoder_type, - encoder_params=c.encoder_params, - use_encoder_prenet=c["use_encoder_prenet"], - inference_noise_scale=c.inference_noise_scale, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.05, - num_speakers=num_speakers, - c_in_channels=0, - num_splits=4, - num_squeeze=2, - sigmoid_scale=False, - mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "speedy_speech": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - positional_encoding=c["positional_encoding"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - elif c.model.lower() == "align_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - hidden_channels_dp=c["hidden_channels_dp"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - return model - - -def is_tacotron(c): - return "tacotron" in c["model"].lower() - - -# def check_config_tts(c): -# check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech', 'align_tts'], restricted=True, val_type=str) -# check_argument('run_name', c, restricted=True, val_type=str) -# check_argument('run_description', c, val_type=str) - -# # AUDIO -# # check_argument('audio', c, restricted=True, val_type=dict) - -# # audio processing parameters -# # check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, 
max_val=2056) -# # check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) -# # check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) -# # check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') -# # check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') -# # check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) -# # check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) -# # check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) -# # check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) -# # check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) - -# # vocabulary parameters -# check_argument('characters', c, restricted=False, val_type=dict) -# check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys() and c['use_phonemes'], val_type=str) -# check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - -# # normalization parameters -# # check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) -# # check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) -# # check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) -# # check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100) -# # check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) -# # check_argument('trim_db', c['audio'], restricted=True, val_type=int) - -# # training parameters -# # check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('r', c, restricted=True, val_type=int, min_val=1) -# # check_argument('gradual_training', c, restricted=False, val_type=list) -# # check_argument('mixed_precision', c, restricted=False, val_type=bool) -# # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) - -# # loss parameters -# # check_argument('loss_masking', c, restricted=True, val_type=bool) -# # if c['model'].lower() in ['tacotron', 'tacotron2']: -# # check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_loss_alpha', c, 
restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0) - -# # validation parameters -# # check_argument('run_eval', c, restricted=True, val_type=bool) -# # check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) -# # check_argument('test_sentences_file', c, restricted=False, val_type=str) - -# # optimizer -# check_argument('noam_schedule', c, restricted=False, val_type=bool) -# check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) -# check_argument('epochs', c, restricted=True, val_type=int, min_val=1) -# check_argument('lr', c, restricted=True, val_type=float, min_val=0) -# check_argument('wd', c, restricted=is_tacotron(c), val_type=float, min_val=0) -# check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) -# check_argument('seq_len_norm', c, restricted=is_tacotron(c), val_type=bool) - -# # tacotron prenet -# # check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1) -# # check_argument('prenet_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['original', 'bn']) -# # check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool) - -# # attention -# check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original', 'dynamic_convolution']) -# check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int) -# check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax']) -# check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('use_forward_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('forward_attn_mask', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('location_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('bidirectional_decoder', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) - -# if c['model'].lower() in ['tacotron', 'tacotron2']: -# # stopnet -# # check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool) - -# # Model Parameters for non-tacotron models -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('positional_encoding', c, restricted=True, val_type=type) -# check_argument('encoder_type', c, restricted=True, val_type=str) -# check_argument('encoder_params', 
c, restricted=True, val_type=dict) -# check_argument('decoder_residual_conv_bn_params', c, restricted=True, val_type=dict) - -# # GlowTTS parameters -# check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str) - -# # tensorboard -# # check_argument('print_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('save_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('checkpoint', c, restricted=True, val_type=bool) -# # check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) - -# # dataloading -# # pylint: disable=import-outside-toplevel -# from TTS.tts.utils.text import cleaners -# # check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) -# # check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) -# # check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) -# # check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) -# # check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) -# # check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool) - -# # paths -# # check_argument('output_path', c, restricted=True, val_type=str) - -# # multi-speaker and gst -# # check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) -# # check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool) -# # check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str) -# if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']: -# # check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst', c, restricted=is_tacotron(c), val_type=dict) -# # check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict]) -# # check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000) -# # check_argument('gst_use_speaker_embedding', c['gst'], restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst_num_heads', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10) -# # check_argument('gst_num_style_tokens', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000) - -# # datasets - checking only the first entry -# # check_argument('datasets', c, restricted=True, val_type=list) -# # for dataset_entry in c['datasets']: -# # check_argument('name', dataset_entry, restricted=True, val_type=str) -# # check_argument('path', dataset_entry, restricted=True, val_type=str) -# # check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) -# # check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) From 2ac6b824ca694c5229dee04c9a9247b41214dd59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:40:25 +0200 Subject: [PATCH 014/258] update `synthesis.py` for the trainer --- TTS/tts/utils/speakers.py | 8 +- TTS/tts/utils/synthesis.py | 186 +++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 
4ab78f88..374139ee 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -34,10 +34,6 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speakers(items): - - - def parse_speakers(c, args, meta_data_train, OUT_PATH): """Returns number of speakers, speaker embedding shape and speaker mapping""" if c.use_speaker_embedding: @@ -135,7 +131,7 @@ class SpeakerManager: ): self.data_items = [] - self.x_vectors = [] + self.x_vectors = {} self.speaker_ids = [] self.clip_ids = [] self.speaker_encoder = None @@ -171,7 +167,7 @@ class SpeakerManager: def x_vector_dim(self): return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) - def parser_speakers_from_items(self, items: list): + def parse_speakers_from_items(self, items: list): speakers = sorted({item[2] for item in items}) self.speaker_ids = {name: i for i, name in enumerate(speakers)} num_speakers = len(self.speaker_ids) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9f417a1d..4c3331c8 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -13,7 +13,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed: import tensorflow as tf -def text_to_seqvec(text, CONFIG): +def text_to_seq(text, CONFIG): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -58,81 +58,82 @@ def numpy_to_tf(np_array, dtype): def compute_style_mel(style_wav, ap, cuda=False): - style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) + style_mel = torch.FloatTensor( + ap.melspectrogram(ap.load_wav(style_wav, + sr=ap.sample_rate))).unsqueeze(0) if cuda: return style_mel.cuda() return style_mel -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None): - if "tacotron" in CONFIG.model.lower(): - if CONFIG.gst: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - if truncated: - decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - elif "glow" in CONFIG.model.lower(): - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, _, _, _, alignments, _, _ = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, _, _, _, alignments, _, _ = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to tacotron models. 
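The old run_model_torch above dispatched on the model type; the refactored version just below hands everything to `model.inference()` through a `cond_input` dictionary and receives an `outputs` dictionary back. A rough usage sketch, assuming a loaded Tacotron2 `model`, a `[1, T]` LongTensor `text_inputs`, and optional conditioning tensors (note that at this revision the helper passes the key `'x_vector'` while `Tacotron2.inference` reads `'x_vectors'`; the sketch follows the model-side keys):

    cond_input = {
        "speaker_ids": speaker_id,   # LongTensor or None
        "x_vectors": x_vector,       # FloatTensor [1, C] or None
        "style_mel": style_mel,      # reference mel for GST, or None
    }
    outputs = model.inference(text_inputs, cond_input=cond_input)
    postnet_output = outputs["postnet_outputs"]
    alignments = outputs["alignments"]
    stop_tokens = outputs["stop_tokens"]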
- decoder_output = None - stop_tokens = None - elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, alignments = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, alignments = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to tacotron models. - decoder_output = None - stop_tokens = None - else: - raise ValueError("[!] Unknown model name.") - return decoder_output, postnet_output, alignments, stop_tokens +def run_model_torch(model, + inputs, + speaker_id=None, + style_mel=None, + x_vector=None): + outputs = model.inference(inputs, + cond_input={ + 'speaker_ids': speaker_id, + 'x_vector': x_vector, + 'style_mel': style_mel + }) + # elif "glow" in CONFIG.model.lower(): + # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable + # if hasattr(model, "module"): + # # distributed model + # postnet_output, _, _, _, alignments, _, _ = model.module.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # else: + # postnet_output, _, _, _, alignments, _, _ = model.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # postnet_output = postnet_output.permute(0, 2, 1) + # # these only belong to tacotron models. + # decoder_output = None + # stop_tokens = None + # elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: + # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable + # if hasattr(model, "module"): + # # distributed model + # postnet_output, alignments = model.module.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # else: + # postnet_output, alignments = model.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # postnet_output = postnet_output.permute(0, 2, 1) + # # these only belong to tacotron models. + # decoder_output = None + # stop_tokens = None + # else: + # raise ValueError("[!] Unknown model name.") + return outputs -def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: raise NotImplementedError(" [!] GST inference not implemented for TF") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TF") if speaker_id is not None: raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model(inputs, training=False) + decoder_output, postnet_output, alignments, stop_tokens = model( + inputs, training=False) return decoder_output, postnet_output, alignments, stop_tokens -def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: - raise NotImplementedError(" [!] 
GST inference not implemented for TfLite") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TfLite") + raise NotImplementedError( + " [!] GST inference not implemented for TfLite") if speaker_id is not None: - raise NotImplementedError(" [!] Multi-Speaker not implemented for TfLite") + raise NotImplementedError( + " [!] Multi-Speaker not implemented for TfLite") # get input and output details input_details = model.get_input_details() output_details = model.get_output_details() @@ -151,9 +152,11 @@ def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_me return decoder_output, postnet_output, None, None -def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): +def parse_outputs_torch(postnet_output, decoder_output, alignments, + stop_tokens): postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = None if decoder_output is None else decoder_output[0].data.cpu().numpy() + decoder_output = None if decoder_output is None else decoder_output[ + 0].data.cpu().numpy() alignment = alignments[0].cpu().data.numpy() stop_tokens = None if stop_tokens is None else stop_tokens[0].cpu().numpy() return postnet_output, decoder_output, alignment, stop_tokens @@ -174,7 +177,7 @@ def parse_outputs_tflite(postnet_output, decoder_output): def trim_silence(wav, ap): - return wav[: ap.find_endpoint(wav)] + return wav[:ap.find_endpoint(wav)] def inv_spectrogram(postnet_output, ap, CONFIG): @@ -185,23 +188,23 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(speaker_id, cuda=False): +def speaker_id_to_torch(speaker_id, cuda=False): if speaker_id is not None: speaker_id = np.asarray(speaker_id) - # TODO: test this for tacotron models speaker_id = torch.from_numpy(speaker_id) if cuda: return speaker_id.cuda() return speaker_id -def embedding_to_torch(speaker_embedding, cuda=False): - if speaker_embedding is not None: - speaker_embedding = np.asarray(speaker_embedding) - speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(x_vector, cuda=False): + if x_vector is not None: + x_vector = np.asarray(x_vector) + x_vector = torch.from_numpy(x_vector).unsqueeze( + 0).type(torch.FloatTensor) if cuda: - return speaker_embedding.cuda() - return speaker_embedding + return x_vector.cuda() + return x_vector # TODO: perform GL with pytorch for batching @@ -215,7 +218,8 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): """ wavs = [] for idx, spec in enumerate(inputs): - wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding + wav_len = (input_lens[idx] * + ap.hop_length) - ap.hop_length # inverse librosa padding wav = inv_spectrogram(spec, ap, CONFIG) # assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}" wavs.append(wav[:wav_len]) @@ -230,11 +234,10 @@ def synthesis( ap, speaker_id=None, style_wav=None, - truncated=False, enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - speaker_embedding=None, + x_vector=None, backend="torch", ): """Synthesize voice for the given text. @@ -248,8 +251,6 @@ def synthesis( model outputs. speaker_id (int): id of speaker style_wav (str | Dict[str, float]): Uses for style embedding of GST. - truncated (bool): keep model states after inference. It can be used - for continuous inference at long texts. 
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. do_trim_silence (bool): trim silence after synthesis. backend (str): tf or torch @@ -262,14 +263,15 @@ def synthesis( else: style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text - inputs = text_to_seqvec(text, CONFIG) + inputs = text_to_seq(text, CONFIG) # pass tensors to backend if backend == "torch": if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if speaker_embedding is not None: - speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) + if x_vector is not None: + x_vector = embedding_to_torch(x_vector, + cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) @@ -286,24 +288,26 @@ def synthesis( inputs = tf.expand_dims(inputs, 0) # synthesize voice if backend == "torch": - decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding - ) + outputs = run_model_torch(model, + inputs, + speaker_id, + style_mel, + x_vector=x_vector) + postnet_output, decoder_output, alignments, stop_tokens = \ + outputs['postnet_outputs'], outputs['decoder_outputs'],\ + outputs['alignments'], outputs['stop_tokens'] postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens - ) + postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, truncated, speaker_id, style_mel - ) + model, inputs, CONFIG, speaker_id, style_mel) postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens - ) + postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, inputs, CONFIG, truncated, speaker_id, style_mel - ) - postnet_output, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) + model, inputs, CONFIG, speaker_id, style_mel) + postnet_output, decoder_output = parse_outputs_tflite( + postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None From 570633ab8056c142227c635cfa43434d0ce93639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:37 +0200 Subject: [PATCH 015/258] update console logger --- TTS/utils/console_logger.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/utils/console_logger.py b/TTS/utils/console_logger.py index 7d6e1968..bb6644c9 100644 --- a/TTS/utils/console_logger.py +++ b/TTS/utils/console_logger.py @@ -68,11 +68,10 @@ class ConsoleLogger: print(log_text, flush=True) def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") + print(f"\n{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") def print_eval_step(self, step, loss_dict, avg_loss_dict): indent = " | > " - print() log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" for key, value in loss_dict.items(): # print the avg value if given @@ -84,7 +83,7 @@ class ConsoleLogger: def print_epoch_end(self, epoch, avg_loss_dict): indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) + log_text = "\n 
{}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given color = "" From d769af9e3b70860a8fe1043d95f7df994ed1f662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:56 +0200 Subject: [PATCH 016/258] remove `truncated` from synthesizer --- TTS/utils/synthesizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bca3df31..5962950f 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -230,7 +230,6 @@ class Synthesizer(object): ap=self.ap, speaker_id=None, style_wav=style_wav, - truncated=False, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, speaker_embedding=speaker_embedding, From 57cdddef160082374a7ce1bf66f58606a76e32b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:45:27 +0200 Subject: [PATCH 017/258] add trainer and train_tts --- TTS/bin/train_tts.py | 28 ++ TTS/trainer.py | 756 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 784 insertions(+) create mode 100644 TTS/bin/train_tts.py create mode 100644 TTS/trainer.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py new file mode 100644 index 00000000..5058d341 --- /dev/null +++ b/TTS/bin/train_tts.py @@ -0,0 +1,28 @@ +import os +import sys +import traceback +from TTS.utils.arguments import init_training +from TTS.utils.generic_utils import remove_experiment_folder +from TTS.trainer import TrainerTTS + + +def main(): + # try: + args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training( + sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) + trainer.fit() + # except KeyboardInterrupt: + # remove_experiment_folder(OUT_PATH) + # try: + # sys.exit(0) + # except SystemExit: + # os._exit(0) # pylint: disable=protected-access + # except Exception: # pylint: disable=broad-except + # remove_experiment_folder(OUT_PATH) + # traceback.print_exc() + # sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/TTS/trainer.py b/TTS/trainer.py new file mode 100644 index 00000000..cfb72191 --- /dev/null +++ b/TTS/trainer.py @@ -0,0 +1,756 @@ +# -*- coding: utf-8 -*- + +import os +import sys +import time +import traceback +from random import randrange +import logging +import importlib + +import numpy as np +import torch + +# DISTRIBUTED +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.tts.datasets import load_meta_data, TTSDataset +from TTS.tts.layers import setup_loss +from TTS.tts.models import setup_model +from TTS.tts.utils.io import save_best_model, save_checkpoint +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.utils.arguments import init_training +from TTS.tts.utils.visual import plot_spectrogram, plot_alignment +from TTS.utils.audio import AudioProcessor +from TTS.utils.distribute import init_distributed, reduce_tensor +from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict, find_module +from TTS.utils.training import setup_torch_training_env, check_update + + +@dataclass +class TrainingArgs(Coqpit): + continue_path: str = field( + default='', + metadata={ + 'help': + 'Path to a training folder to 
continue training. Restore the model from the last checkpoint and continue training under the same folder.' + }) + restore_path: str = field( + default='', + metadata={ + 'help': + 'Path to a model checkpoit. Restore the model with the given checkpoint and start a new training.' + }) + best_path: str = field( + default='', + metadata={ + 'help': + "Best model file to be used for extracting best loss. If not specified, the latest best model in continue path is used" + }) + config_path: str = field( + default='', metadata={'help': 'Path to the configuration file.'}) + rank: int = field( + default=0, metadata={'help': 'Process rank in distributed training.'}) + group_id: str = field( + default='', + metadata={'help': 'Process group id in distributed training.'}) + + +# pylint: disable=import-outside-toplevel, too-many-public-methods +class TrainerTTS: + use_cuda, num_gpus = setup_torch_training_env(True, False) + + def __init__(self, + args, + config, + c_logger, + tb_logger, + model=None, + output_path=None): + self.args = args + self.config = config + self.c_logger = c_logger + self.tb_logger = tb_logger + self.output_path = output_path + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, 'test_audios') + + self.keep_avg_train = None + self.keep_avg_eval = None + + # model, audio processor, datasets, loss + # init audio processor + self.ap = AudioProcessor(**config.audio.to_dict()) + + # init character processor + self.model_characters = self.init_character_processor() + + # load dataset samples + self.data_train, self.data_eval = load_meta_data(config.datasets) + + # default speaker manager + self.speaker_manager = self.init_speaker_manager() + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.init_model() + + # setup criterion + self.criterion = self.init_criterion() + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed(args.rank, self.num_gpus, args.group_id, + config.distributed["backend"], + config.distributed["url"]) + + # scalers for mixed precision training + self.scaler = torch.cuda.amp.GradScaler( + ) if config.mixed_precision else None + + # setup optimizer + self.optimizer = self.init_optimizer(self.model) + + # setup scheduler + self.scheduler = self.init_scheduler(self.config, self.optimizer) + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, + self.scaler) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank]) + + # count model size + num_params = count_parameters(self.model) + logging.info("\n > Model has {} parameters".format(num_params), + flush=True) + + def init_model(self): + model = setup_model( + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.x_vector_dim + if self.speaker_manager.x_vectors else None, + ) + return model + + def init_optimizer(self, model): + optimizer_name = self.config.optimizer + optimizer_params = self.config.optimizer_params + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), + 
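                # Illustrative resolution of the optimizer by name (the values below are
                # assumptions, not defaults taken from this patch): with
                #   config.optimizer = "Adam"
                #   config.optimizer_params = {"betas": [0.9, 0.998], "weight_decay": 1e-6}
                #   config.lr = 1e-3
                # the lookup above resolves to torch.optim.Adam and the surrounding call
                # becomes torch.optim.Adam(model.parameters(), lr=1e-3,
                # betas=[0.9, 0.998], weight_decay=1e-6); "RAdam" is special-cased to
                # TTS.utils.radam.RAdam.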
lr=self.config.lr, + **optimizer_params) + + def init_character_processor(self): + # setup custom characters if set in config file. + # TODO: implement CharacterProcessor + if self.config.characters is not None: + symbols, phonemes = make_symbols( + **self.config.characters.to_dict()) + else: + from TTS.tts.utils.text.symbols import symbols, phonemes + model_characters = phonemes if self.config.use_phonemes else symbols + return model_characters + + def init_speaker_manager(self, restore_path: str = "", out_path: str = ""): + speaker_manager = SpeakerManager() + if restore_path: + speakers_file = os.path.join(os.path.dirname(restore_path), + "speaker.json") + if not os.path.exists(speakers_file): + logging.info( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + speakers_file = self.config.external_speaker_embedding_file + + if self.config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + self.speaker_manage.load_speaker_mapping(speakers_file) + elif self.config.use_external_speaker_embedding_file and self.config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file( + self.config.external_speaker_embedding_file) + else: + speaker_manager.parse_speakers_from_items(self.data_train) + file_path = os.path.join(out_path, "speakers.json") + speaker_manager.save_ids_file(file_path) + return speaker_manager + + def init_scheduler(self, config, optimizer): + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + from TTS.utils.training import NoamLR + scheduler = NoamLR + else: + scheduler = getattr(torch.optim, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + def init_criterion(self): + return setup_loss(self.config) + + def restore_model(self, + config, + restore_path, + model, + optimizer, + scaler=None): + logging.info(f" > Restoring from {os.path.basename(restore_path)}...") + checkpoint = torch.load(restore_path, map_location="cpu") + try: + logging.info(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + # optimizer restore + logging.info(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scaler" in checkpoint and config.mixed_precision: + logging.info(" > Restoring AMP Scaler...") + scaler.load_state_dict(checkpoint["scaler"]) + except (KeyError, RuntimeError): + logging.info(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + for group in optimizer.param_groups: + group["lr"] = self.config.lr + logging.info(" > Model restored from step %d" % checkpoint["step"], + flush=True) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + def _setup_loader(self, r, ap, is_eval, data_items, verbose, + speaker_mapping): + if is_eval and not self.config.run_eval: + loader = None + else: + dataset = TTSDataset( + outputs_per_step=r, + text_cleaner=self.config.text_cleaner, + compute_linear_spec= 'tacotron' == self.config.model.lower(), + meta_data=data_items, + ap=ap, + tp=self.config.characters, + add_blank=self.config["add_blank"], + batch_group_size=0 if is_eval else + self.config.batch_group_size * self.config.batch_size, + min_seq_len=self.config.min_seq_len, + max_seq_len=self.config.max_seq_len, + 
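                # The dataset is wired straight from config fields; an illustrative
                # (not prescriptive) configuration for an LJSpeech-style run might set:
                #   "batch_group_size": 4, "min_seq_len": 6, "max_seq_len": 153,
                #   "use_phonemes": true, "phoneme_cache_path": "/path/to/phoneme_cache",
                #   "compute_input_seq_cache": true
                # Linear spectrograms are computed only for the original Tacotron, whose
                # postnet predicts linear features; Tacotron2 and the other models train
                # on mels only.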
phoneme_cache_path=self.config.phoneme_cache_path, + use_phonemes=self.config.use_phonemes, + phoneme_language=self.config.phoneme_language, + enable_eos_bos=self.config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_mapping=speaker_mapping + if self.config.use_speaker_embedding + and self.config.use_external_speaker_embedding_file else None, + ) + + if self.config.use_phonemes and self.config.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(self.config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler( + dataset) if self.num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.eval_batch_size + if is_eval else self.config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_val_loader_workers + if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def setup_train_dataloader(self, r, ap, data_items, verbose, + speaker_mapping): + return self._setup_loader(r, ap, False, data_items, verbose, + speaker_mapping) + + def setup_eval_dataloder(self, r, ap, data_items, verbose, + speaker_mapping): + return self._setup_loader(r, ap, True, data_items, verbose, + speaker_mapping) + + def format_batch(self, batch): + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron" + ] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + speaker_embeddings = batch[8] + attn_mask = batch[9] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # convert speaker names to ids + if self.config.use_speaker_embedding: + if self.config.use_external_speaker_embedding_file: + speaker_embeddings = batch[8] + speaker_ids = None + else: + speaker_ids = [ + self.speaker_manager.speaker_ids[speaker_name] + for speaker_name in speaker_names + ] + speaker_ids = torch.LongTensor(speaker_ids) + speaker_embeddings = None + else: + speaker_embeddings = None + speaker_ids = None + + # compute durations from attention masks + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, :text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. 
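        # A toy check of the folding performed just below, assuming r = 2:
        #   stop_targets = torch.tensor([[0., 0., 0., 0., 1., 1.]])        # [B=1, T_out=6]
        #   folded = stop_targets.view(1, stop_targets.size(1) // 2, -1)   # -> shape [1, 3, 2]
        #   folded = (folded.sum(2) > 0.0).float()                         # -> tensor([[0., 0., 1.]])
        # i.e. every r consecutive frames collapse into a single stop flag, matching the
        # one stop token the decoder predicts per reduction step.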
+ stop_targets = stop_targets.view(text_input.shape[0], + stop_targets.size(1) // self.config.r, + -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze(2) + + # dispatch batch to GPU + if self.use_cuda: + text_input = text_input.cuda(non_blocking=True) + text_lengths = text_lengths.cuda(non_blocking=True) + mel_input = mel_input.cuda(non_blocking=True) + mel_lengths = mel_lengths.cuda(non_blocking=True) + linear_input = linear_input.cuda( + non_blocking=True) if self.config.model.lower() in [ + "tacotron" + ] else None + stop_targets = stop_targets.cuda(non_blocking=True) + attn_mask = attn_mask.cuda( + non_blocking=True) if attn_mask is not None else None + durations = durations.cuda( + non_blocking=True) if attn_mask is not None else None + if speaker_ids is not None: + speaker_ids = speaker_ids.cuda(non_blocking=True) + if speaker_embeddings is not None: + speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "x_vectors": speaker_embeddings, + "max_text_length": max_text_length, + "max_spec_length": max_spec_length, + "item_idx": item_idx + } + + def train_step(self, batch, batch_n_steps, step, loader_start_time): + self.on_train_step_start() + step_start_time = time.time() + + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # zero-out optimizer + self.optimizer.zero_grad() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self.model.train_step(batch, self.criterion) + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError( + f"Detected NaN loss at step {self.total_steps_done}.") + + # optimizer step + if self.config.mixed_precision: + # model optimizer step in mixed precision mode + self.scaler.scale(loss_dict["loss"]).backward() + self.scaler.unscale_(self.optimizer) + grad_norm, _ = check_update(self.model, + self.config.grad_clip, + ignore_stopnet=True) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + # main model optimizer step + loss_dict["loss"].backward() + grad_norm, _ = check_update(self.model, + self.config.grad_clip, + ignore_stopnet=True) + self.optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if self.config.lr_scheduler: + self.scheduler.step() + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_train_values = dict() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + self.keep_avg_train.update_values(update_train_values) + + # print training progress + current_lr = self.optimizer.param_groups[0]["lr"] + if self.total_steps_done % self.config.print_step == 0: + log_dict = { + "max_spec_length": [batch["max_spec_length"], + 1], # value, precision + "max_text_length": [batch["max_text_length"], 1], + "step_time": [step_time, 4], + "loader_time": [loader_time, 2], + "current_lr": current_lr, + } + self.c_logger.print_train_step(batch_n_steps, step, + 
self.total_steps_done, log_dict, + loss_dict, + self.keep_avg_train.avg_values) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = { + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + } + iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, + iter_stats) + + if self.total_steps_done % self.config.save_step == 0: + if self.config.checkpoint: + # save model + save_checkpoint( + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + model_loss=loss_dict["loss"], + characters=self.model_characters, + scaler=self.scaler.state_dict() + if self.config.mixed_precision else None, + ) + # training visualizations + figures, audios = self.model.train_log(self.ap, batch, outputs) + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + self.tb_logger.tb_train_audios(self.total_steps_done, + {"TrainAudio": audios}, + self.ap.sample_rate) + self.total_steps_done += 1 + self.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self): + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int( + len(self.train_loader.dataset) / + (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int( + len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.train_loader): + _, _ = self.train_step(batch, batch_num_steps, cur_step, + loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, + epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, + self.total_steps_done) + + def eval_step(self, batch, step): + with torch.no_grad(): + step_start_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self.model.eval_step( + batch, self.criterion) + + step_time = time.time() - step_start_time + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_step_time"] = step_time + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, + self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self): + self.model.eval() + if self.use_cuda: + batch_num_steps = int( + len(self.train_loader.dataset) / + (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int( + len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_eval_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({'avg_loader_time': loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # Plot epoch stats and 
samples from the last batch. + if self.args.rank == 0: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + self.tb_logger.tb_eval_audios(self.total_steps_done, + {"EvalAudio": eval_audios}, + self.ap.sample_rate) + + def test_run(self, ): + logging.info(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + cond_inputs = self._get_cond_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=cond_inputs['speaker_id'], + x_vector=cond_inputs['x_vector'], + style_wav=cond_inputs['style_wav'], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + os.makedirs(file_path, exist_ok=True) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + self.ap.save_wav(wav, file_path) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram( + model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment( + alignment, output_fig=False) + + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, + self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) + + def _get_cond_inputs(self): + # setup speaker_id + speaker_id = 0 if self.config.use_speaker_embedding else None + # setup x_vector + x_vector = self.speaker_manager.get_x_vectors_by_speaker( + self.speaker_manager.speaker_ids[0] + ) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None + # setup style_mel + if self.config.has('gst_style_input'): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and 'use_gst' in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
+ style_wav = {} + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + cond_inputs = {'speaker_id': speaker_id, 'style_wav': style_wav, 'x_vector': x_vector} + return cond_inputs + + def fit(self): + if self.restore_step != 0 or self.args.best_path: + logging.info(" > Restoring best loss from " + f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, + map_location="cpu")["model_loss"] + logging.info( + f" > Starting with loaded last best loss {self.best_loss}.") + + # define data loaders + self.train_loader = self.setup_train_dataloader( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_mapping=self.speaker_manager.speaker_ids) + self.eval_loader = self.setup_eval_dataloder( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_mapping=self.speaker_manager.speaker_ids + ) if self.config.run_eval else None + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage( + ) if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values + if self.config.run_eval else self.keep_avg_train.avg_values) + self.save_best_model() + self.on_epoch_end() + + def save_best_model(self): + self.best_loss = save_best_model( + self.keep_avg_eval['avg_loss'] + if self.keep_avg_eval else self.keep_avg_train['avg_loss'], + self.best_loss, + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + self.model_characters, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + scaler=self.scaler.state_dict() + if self.config.mixed_precision else None, + ) + + def on_epoch_start(self): + if hasattr(self.model, 'on_epoch_start'): + self.model.on_epoch_start(self) + + if hasattr(self.criterion, "on_epoch_start"): + self.criterion.on_epoch_start(self) + + if hasattr(self.optimizer, "on_epoch_start"): + self.optimizer.on_epoch_start(self) + + def on_epoch_end(self): + if hasattr(self.model, "on_epoch_start"): + self.model.on_epoch_end(self) + + if hasattr(self.criterion, "on_epoch_end"): + self.criterion.on_epoch_end(self) + + if hasattr(self.optimizer, "on_epoch_end"): + self.optimizer.on_epoch_end(self) + + def on_train_step_start(self): + if hasattr(self.model, "on_epoch_start"): + self.model.on_train_step_start(self) + + if hasattr(self.criterion, "on_train_step_start"): + self.criterion.on_train_step_start(self) + + if hasattr(self.optimizer, "on_train_step_start"): + self.optimizer.on_train_step_start(self) + + def on_train_step_end(self): + if hasattr(self.model, "on_train_step_end"): + self.model.on_train_step_end(self) + + if hasattr(self.criterion, "on_train_step_end"): + self.criterion.on_train_step_end(self) + + if hasattr(self.optimizer, "on_train_step_end"): + self.optimizer.on_train_step_end(self) From 6bf6543df837d342bd1445a9a9bb917c8d90377b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:33:58 +0200 Subject: [PATCH 018/258] typing annotation for the trainer --- TTS/trainer.py | 275 
+++++++++++++++++++++++++++---------------------- 1 file changed, 151 insertions(+), 124 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index cfb72191..3beb281f 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -65,12 +65,12 @@ class TrainerTTS: use_cuda, num_gpus = setup_torch_training_env(True, False) def __init__(self, - args, - config, - c_logger, - tb_logger, - model=None, - output_path=None): + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger, + tb_logger: TensorboardLogger, + model: nn.Module = None, + output_path: str = None) -> None: self.args = args self.config = config self.c_logger = c_logger @@ -88,43 +88,52 @@ class TrainerTTS: self.keep_avg_train = None self.keep_avg_eval = None + log_file = os.path.join(self.output_path, + f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + # model, audio processor, datasets, loss # init audio processor - self.ap = AudioProcessor(**config.audio.to_dict()) + self.ap = AudioProcessor(**self.config.audio.to_dict()) # init character processor - self.model_characters = self.init_character_processor() + self.model_characters = self.get_character_processor(self.config) # load dataset samples - self.data_train, self.data_eval = load_meta_data(config.datasets) + self.data_train, self.data_eval = load_meta_data(self.config.datasets) # default speaker manager - self.speaker_manager = self.init_speaker_manager() + self.speaker_manager = self.get_speaker_manager( + self.config, args.restore_path, self.config.output_path, self.data_train) # init TTS model if model is not None: self.model = model else: - self.model = self.init_model() + self.model = self.get_model( + len(self.model_characters), self.speaker_manager.num_speakers, + self.config, self.speaker_manager.x_vector_dim + if self.speaker_manager.x_vectors else None) # setup criterion - self.criterion = self.init_criterion() + self.criterion = self.get_criterion(self.config) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() # DISTRUBUTED if self.num_gpus > 1: init_distributed(args.rank, self.num_gpus, args.group_id, - config.distributed["backend"], - config.distributed["url"]) + self.config.distributed["backend"], + self.config.distributed["url"]) # scalers for mixed precision training self.scaler = torch.cuda.amp.GradScaler( - ) if config.mixed_precision else None + ) if self.config.mixed_precision and self.use_cuda else None # setup optimizer - self.optimizer = self.init_optimizer(self.model) - - # setup scheduler - self.scheduler = self.init_scheduler(self.config, self.optimizer) + self.optimizer = self.get_optimizer(self.model, self.config) if self.args.restore_path: self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( @@ -144,64 +153,66 @@ class TrainerTTS: logging.info("\n > Model has {} parameters".format(num_params), flush=True) - def init_model(self): - model = setup_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.x_vector_dim - if self.speaker_manager.x_vectors else None, - ) + @staticmethod + def get_model(num_chars: int, num_speakers: int, config: Coqpit, + x_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, x_vector_dim) return model - def init_optimizer(self, model): - optimizer_name = self.config.optimizer - optimizer_params = self.config.optimizer_params + @staticmethod + def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: + optimizer_name = config.optimizer 
+ optimizer_params = config.optimizer_params if optimizer_name.lower() == "radam": module = importlib.import_module("TTS.utils.radam") optimizer = getattr(module, "RAdam") else: optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), - lr=self.config.lr, - **optimizer_params) + return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - def init_character_processor(self): + @staticmethod + def get_character_processor(config: Coqpit) -> str: # setup custom characters if set in config file. # TODO: implement CharacterProcessor - if self.config.characters is not None: - symbols, phonemes = make_symbols( - **self.config.characters.to_dict()) + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters.to_dict()) else: - from TTS.tts.utils.text.symbols import symbols, phonemes - model_characters = phonemes if self.config.use_phonemes else symbols + from TTS.tts.utils.text.symbols import phonemes, symbols + model_characters = phonemes if config.use_phonemes else symbols return model_characters - def init_speaker_manager(self, restore_path: str = "", out_path: str = ""): + @staticmethod + def get_speaker_manager(config: Coqpit, + restore_path: str = "", + out_path: str = "", + data_train: List = []) -> SpeakerManager: speaker_manager = SpeakerManager() - if restore_path: - speakers_file = os.path.join(os.path.dirname(restore_path), - "speaker.json") - if not os.path.exists(speakers_file): - logging.info( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speakers_file = self.config.external_speaker_embedding_file + if config.use_speaker_embedding: + if restore_path: + speakers_file = os.path.join(os.path.dirname(restore_path), + "speaker.json") + if not os.path.exists(speakers_file): + print( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + speakers_file = config.external_speaker_embedding_file - if self.config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) + if config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + speaker_manager.load_ids_file(speakers_file) + elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file( + config.external_speaker_embedding_file) else: - self.speaker_manage.load_speaker_mapping(speakers_file) - elif self.config.use_external_speaker_embedding_file and self.config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file( - self.config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(self.data_train) - file_path = os.path.join(out_path, "speakers.json") - speaker_manager.save_ids_file(file_path) + speaker_manager.parse_speakers_from_items(data_train) + file_path = os.path.join(out_path, "speakers.json") + speaker_manager.save_ids_file(file_path) return speaker_manager - def init_scheduler(self, config, optimizer): + @staticmethod + def get_scheduler(config: Coqpit, + optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: @@ -213,17 +224,20 @@ class TrainerTTS: scheduler = getattr(torch.optim, lr_scheduler) return scheduler(optimizer, **lr_scheduler_params) - def init_criterion(self): - return setup_loss(self.config) + 
@staticmethod + def get_criterion(config: Coqpit) -> nn.Module: + return setup_loss(config) - def restore_model(self, - config, - restore_path, - model, - optimizer, - scaler=None): - logging.info(f" > Restoring from {os.path.basename(restore_path)}...") - checkpoint = torch.load(restore_path, map_location="cpu") + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) try: logging.info(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) @@ -242,20 +256,20 @@ class TrainerTTS: for group in optimizer.param_groups: group["lr"] = self.config.lr - logging.info(" > Model restored from step %d" % checkpoint["step"], - flush=True) + print(" > Model restored from step %d" % checkpoint["step"], ) restore_step = checkpoint["step"] return model, optimizer, scaler, restore_step - def _setup_loader(self, r, ap, is_eval, data_items, verbose, - speaker_mapping): + def _get_loader(self, r: int, ap: AudioProcessor, is_eval: bool, + data_items: List, verbose: bool, + speaker_mapping: Union[Dict, List]) -> DataLoader: if is_eval and not self.config.run_eval: loader = None else: dataset = TTSDataset( outputs_per_step=r, text_cleaner=self.config.text_cleaner, - compute_linear_spec= 'tacotron' == self.config.model.lower(), + compute_linear_spec=self.config.model.lower() == "tacotron", meta_data=data_items, ap=ap, tp=self.config.characters, @@ -296,17 +310,19 @@ class TrainerTTS: ) return loader - def setup_train_dataloader(self, r, ap, data_items, verbose, - speaker_mapping): - return self._setup_loader(r, ap, False, data_items, verbose, - speaker_mapping) + def get_train_dataloader(self, r: int, ap: AudioProcessor, + data_items: List, verbose: bool, + speaker_mapping: Union[List, Dict]) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, + speaker_mapping) - def setup_eval_dataloder(self, r, ap, data_items, verbose, - speaker_mapping): - return self._setup_loader(r, ap, True, data_items, verbose, - speaker_mapping) + def get_eval_dataloder(self, r: int, ap: AudioProcessor, data_items: List, + verbose: bool, + speaker_mapping: Union[List, Dict]) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, + speaker_mapping) - def format_batch(self, batch): + def format_batch(self, batch: List) -> Dict: # setup input batch text_input = batch[0] text_lengths = batch[1] @@ -401,7 +417,8 @@ class TrainerTTS: "item_idx": item_idx } - def train_step(self, batch, batch_n_steps, step, loader_start_time): + def train_step(self, batch: Dict, batch_n_steps: int, step: int, + loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -515,7 +532,7 @@ class TrainerTTS: self.on_train_step_end() return outputs, loss_dict - def train_epoch(self): + def train_epoch(self) -> None: self.model.train() epoch_start_time = time.time() if self.use_cuda: @@ -541,7 +558,7 @@ class TrainerTTS: self.tb_logger.tb_model_weights(self.model, self.total_steps_done) - def eval_step(self, batch, step): + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() @@ -572,17 +589,11 @@ class TrainerTTS: self.keep_avg_eval.avg_values) return outputs, loss_dict - def eval_epoch(self): + def 
eval_epoch(self) -> None: self.model.eval() - if self.use_cuda: - batch_num_steps = int( - len(self.train_loader.dataset) / - (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int( - len(self.train_loader.dataset) / self.config.batch_size) self.c_logger.print_eval_start() loader_start_time = time.time() + batch = None for cur_step, batch in enumerate(self.eval_loader): # format data batch = self.format_batch(batch) @@ -597,8 +608,8 @@ class TrainerTTS: {"EvalAudio": eval_audios}, self.ap.sample_rate) - def test_run(self, ): - logging.info(" | > Synthesizing test sentences.") + def test_run(self, ) -> None: + print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -618,9 +629,11 @@ class TrainerTTS: do_trim_silence=False, ).values() - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + file_path = os.path.join(self.output_audio_path, + str(self.total_steps_done)) os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + file_path = os.path.join(file_path, + "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav test_figures["{}-prediction".format(idx)] = plot_spectrogram( @@ -629,16 +642,17 @@ class TrainerTTS: alignment, output_fig=False) self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, - self.config.audio["sample_rate"]) + self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - def _get_cond_inputs(self): + def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup x_vector - x_vector = self.speaker_manager.get_x_vectors_by_speaker( - self.speaker_manager.speaker_ids[0] - ) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None + x_vector = (self.speaker_manager.get_x_vectors_by_speaker( + self.speaker_manager.speaker_ids[0]) + if self.config.use_external_speaker_embedding_file + and self.config.use_speaker_embedding else None) # setup style_mel if self.config.has('gst_style_input'): style_wav = self.config.gst_style_input @@ -647,35 +661,40 @@ class TrainerTTS: if style_wav is None and 'use_gst' in self.config and self.config.use_gst: # inicialize GST with zero dict. style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + print( + "WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!" 
+ ) for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {'speaker_id': speaker_id, 'style_wav': style_wav, 'x_vector': x_vector} + cond_inputs = { + "speaker_id": speaker_id, + "style_wav": style_wav, + "x_vector": x_vector + } return cond_inputs - def fit(self): + def fit(self) -> None: if self.restore_step != 0 or self.args.best_path: - logging.info(" > Restoring best loss from " - f"{os.path.basename(self.args.best_path)} ...") + print(" > Restoring best loss from " + f"{os.path.basename(self.args.best_path)} ...") self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - logging.info( - f" > Starting with loaded last best loss {self.best_loss}.") + print(f" > Starting with loaded last best loss {self.best_loss}.") # define data loaders - self.train_loader = self.setup_train_dataloader( + self.train_loader = self.get_train_dataloader( self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids) - self.eval_loader = self.setup_eval_dataloder( + self.eval_loader = (self.get_eval_dataloder( self.config.r, self.ap, self.data_train, verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids - ) if self.config.run_eval else None + speaker_mapping=self.speaker_manager.speaker_ids) + if self.config.run_eval else None) self.total_steps_done = self.restore_step @@ -697,10 +716,10 @@ class TrainerTTS: self.save_best_model() self.on_epoch_end() - def save_best_model(self): + def save_best_model(self) -> None: self.best_loss = save_best_model( - self.keep_avg_eval['avg_loss'] - if self.keep_avg_eval else self.keep_avg_train['avg_loss'], + self.keep_avg_eval["avg_loss"] + if self.keep_avg_eval else self.keep_avg_train["avg_loss"], self.best_loss, self.model, self.optimizer, @@ -715,8 +734,16 @@ class TrainerTTS: if self.config.mixed_precision else None, ) - def on_epoch_start(self): - if hasattr(self.model, 'on_epoch_start'): + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, + format="", + handlers=[logging.FileHandler(log_file), + logging.StreamHandler()]) + + def on_epoch_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_start"): self.model.on_epoch_start(self) if hasattr(self.criterion, "on_epoch_start"): @@ -725,8 +752,8 @@ class TrainerTTS: if hasattr(self.optimizer, "on_epoch_start"): self.optimizer.on_epoch_start(self) - def on_epoch_end(self): - if hasattr(self.model, "on_epoch_start"): + def on_epoch_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_end"): self.model.on_epoch_end(self) if hasattr(self.criterion, "on_epoch_end"): @@ -735,8 +762,8 @@ class TrainerTTS: if hasattr(self.optimizer, "on_epoch_end"): self.optimizer.on_epoch_end(self) - def on_train_step_start(self): - if hasattr(self.model, "on_epoch_start"): + def on_train_step_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_start"): self.model.on_train_step_start(self) if hasattr(self.criterion, "on_train_step_start"): @@ -745,7 +772,7 @@ class TrainerTTS: if hasattr(self.optimizer, "on_train_step_start"): self.optimizer.on_train_step_start(self) - def on_train_step_end(self): + def on_train_step_end(self) -> None: # pylint: disable=no-self-use if hasattr(self.model, "on_train_step_end"): self.model.on_train_step_end(self) From 9765b1aa6bf66f12bffab1fc70c730f96c217968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 
May 2021 09:52:29 +0200 Subject: [PATCH 019/258] update `glow_tts_config.py` for setting the optimizer and the scheduler --- TTS/tts/configs/glow_tts_config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 36ccb612..214b2377 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -89,12 +89,13 @@ class GlowTTSConfig(BaseTTSConfig): use_external_speaker_embedding_file: bool = False external_speaker_embedding_file: str = False - # optimizer params - noam_schedule: bool = True - warmup_steps: int = 4000 + # optimizer parameters + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) grad_clip: float = 5.0 lr: float = 1e-3 - wd: float = 0.000001 # overrides min_seq_len: int = 3 From 3346a6d9dce5dcef57d9dad10906303387a2a8ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:53:27 +0200 Subject: [PATCH 020/258] update `sequence_mask` import globally --- TTS/tts/layers/glow_tts/encoder.py | 2 +- TTS/tts/layers/glow_tts/monotonic_align/__init__.py | 2 +- TTS/tts/models/align_tts.py | 2 +- TTS/tts/models/glow_tts.py | 5 ++++- TTS/tts/models/speedy_speech.py | 2 +- tests/tts_tests/test_feed_forward_layers.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 2 +- tests/tts_tests/test_tacotron_layers.py | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index 48bb3008..71aee94f 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class Encoder(nn.Module): diff --git a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py index 7be124f4..5cbfd8fc 100644 --- a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py +++ b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch.nn import functional as F -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask try: # TODO: fix pypi cython installation problem. 
diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index e097ac50..db04b72c 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -7,7 +7,7 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class AlignTTS(nn.Module): diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 19eb594a..ca059ab9 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -6,8 +6,11 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class GlowTTS(nn.Module): diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 9880b82b..bc6e912c 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -6,7 +6,7 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class SpeedySpeech(nn.Module): diff --git a/tests/tts_tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py index 1db980a3..1c2d3803 100644 --- a/tests/tts_tests/test_feed_forward_layers.py +++ b/tests/tts_tests/test_feed_forward_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.encoder import Encoder -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 3473769b..21a73812 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.models.speedy_speech import SpeedySpeech -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 9b89e645..6c4b76b5 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -4,7 +4,7 @@ import torch as T from TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask # pylint: 
disable=unused-variable From f09ec7e3a7a50c82fcef65f363e38bf7c6c964ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:54:04 +0200 Subject: [PATCH 021/258] update glow-tts for the trainer --- TTS/tts/models/glow_tts.py | 194 +++++++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 41 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index ca059ab9..09e58ce7 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -38,7 +38,6 @@ class GlowTTS(nn.Module): encoder_params (dict): encoder module parameters. speaker_embedding_dim (int): channels of external speaker embedding vectors. """ - def __init__( self, num_chars, @@ -133,27 +132,29 @@ class GlowTTS(nn.Module): @staticmethod def compute_outputs(attn, o_mean, o_log_scale, x_mask): # compute final values with the computed alignment - y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( - 1, 2 - ) # [b, t', t], [b, t, d] -> [b, d, t'] - y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose( - 1, 2 - ) # [b, t', t], [b, t, d] -> [b, d, t'] + y_mean = torch.matmul( + attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( + 1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + y_log_scale = torch.matmul( + attn.squeeze(1).transpose(1, 2), o_log_scale.transpose( + 1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] # compute total duration with adjustment o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def forward(self, x, x_lengths, y, y_lengths=None, cond_input={'x_vectors':None}): """ Shapes: x: [B, T] x_lenghts: B - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ y_max_length = y.size(2) + y = y.transpose(1, 2) # norm speaker embeddings + g = cond_input['x_vectors'] if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -161,29 +162,54 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess( + y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path with torch.no_grad(): o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, + [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * + (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), + z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, + [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + attn = maximum_path(logp, + attn_mask.squeeze(1)).unsqueeze(1).detach() + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) - return z, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': z, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs @torch.no_grad() - def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def inference_with_MAS(self, + x, + x_lengths, + y=None, + y_lengths=None, + attn=None, + g=None): """ It's similar to the teacher forcing in Tacotron. It was proposed in: https://arxiv.org/abs/2104.05557 @@ -203,24 +229,33 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess( + y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path between z and encoder output o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, + [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * + (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), + z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, + [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) # get predited aligned distribution @@ -228,8 +263,16 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': y, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs @torch.no_grad() def decoder_inference(self, y, y_lengths=None, g=None): @@ -247,7 +290,8 @@ class GlowTTS(nn.Module): else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(y.dtype) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) @@ -266,28 +310,98 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # compute output durations w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() y_max_length = None # compute masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # compute attention mask - attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + attn = 
generate_path(w_ceil.squeeze(1), + attn_mask.squeeze(1)).unsqueeze(1) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) - z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask + z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * + self.inference_noise_scale) * y_mask # decoder pass y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': y, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs + + def train_step(self, batch: dict, criterion: nn.Module): + """Perform a single training step by fetching the right set if samples from the batch. + + Args: + batch (dict): [description] + criterion (nn.Module): [description] + """ + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"x_vectors": x_vectors}) + + loss_dict = criterion(outputs['model_outputs'], outputs['y_mean'], + outputs['y_log_scale'], outputs['logdet'], + mel_lengths, outputs['durations_log'], + outputs['total_durations_log'], text_lengths) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) def preprocess(self, y, y_lengths, y_max_length, attn=None): if y_max_length is not None: - y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze + y_max_length = (y_max_length // + self.num_squeeze) * self.num_squeeze y = y[:, :, :y_max_length] if attn is not None: attn = attn[:, :, :, :y_max_length] @@ -297,9 +411,7 @@ class GlowTTS(nn.Module): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: From c9790bee2c058adb7c09ff5318d5c6513820c7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:54:48 +0200 Subject: [PATCH 022/258] update tacotron model to 
return `model_outputs` --- TTS/tts/models/tacotron.py | 8 ++++---- TTS/tts/models/tacotron2.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 23bd839f..34f04159 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -255,7 +255,7 @@ class Tacotron(TacotronAbstract): outputs['alignments_backward'] = alignments_backward outputs['decoder_outputs_backward'] = decoder_outputs_backward outputs.update({ - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -287,7 +287,7 @@ class Tacotron(TacotronAbstract): postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) outputs = { - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -335,7 +335,7 @@ class Tacotron(TacotronAbstract): # compute loss loss_dict = criterion( - outputs['postnet_outputs'], + outputs['model_outputs'], outputs['decoder_outputs'], mel_input, linear_input, @@ -355,7 +355,7 @@ class Tacotron(TacotronAbstract): return outputs, loss_dict def train_log(self, ap, batch, outputs): - postnet_outputs = outputs['postnet_outputs'] + postnet_outputs = outputs['model_outputs'] alignments = outputs['alignments'] alignments_backward = outputs['alignments_backward'] mel_input = batch['mel_input'] diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 51b181e4..04b97606 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -233,7 +233,7 @@ class Tacotron2(TacotronAbstract): outputs['alignments_backward'] = alignments_backward outputs['decoder_outputs_backward'] = decoder_outputs_backward outputs.update({ - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -254,7 +254,7 @@ class Tacotron2(TacotronAbstract): x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input + x_vector = cond_input['x_vectors'] encoder_outputs = self._concat_speaker_embedding( encoder_outputs, x_vector) @@ -266,7 +266,7 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments = self.shape_outputs( decoder_outputs, postnet_outputs, alignments) outputs = { - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens From 843b3ba9603bd5219b873922381ddc1ce792d969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:02:01 +0200 Subject: [PATCH 023/258] update `speedy_speecy_config.py` for the trainer --- TTS/tts/configs/speedy_speech_config.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 1b8f0c82..42258398 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -99,10 +100,11 @@ class SpeedySpeechConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - 
noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # loss params @@ -114,3 +116,12 @@ class SpeedySpeechConfig(BaseTTSConfig): min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 # DO NOT CHANGE + + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." + ]) From f121b0ff5d55c7ad373bd2fe343c1996a0a5c0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:03:24 +0200 Subject: [PATCH 024/258] update `speedy_speech.py` model for trainer --- TTS/tts/models/speedy_speech.py | 139 +++++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 18 deletions(-) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index bc6e912c..daf67b6c 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -3,6 +3,9 @@ from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path @@ -46,7 +49,12 @@ class SpeedySpeech(nn.Module): positional_encoding=True, length_scale=1, encoder_type="residual_conv_bn", - encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, + encoder_params={ + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13 + }, decoder_type="residual_conv_bn", decoder_params={ "kernel_size": 4, @@ -60,13 +68,17 @@ class SpeedySpeech(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance( + length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, + encoder_params, c_in_channels) if positional_encoding: self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, + decoder_params) + self.duration_predictor = DurationPredictor(hidden_channels + + c_in_channels) if num_speakers > 1 and not external_c: # speaker embedding layer @@ -93,7 +105,9 @@ class SpeedySpeech(nn.Module): """ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), 
en.transpose(1, 2)).transpose(1, 2) + o_en_ex = torch.matmul( + attn.squeeze(1).transpose(1, 2), en.transpose(1, + 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -127,7 +141,8 @@ class SpeedySpeech(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), + 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -140,7 +155,8 @@ class SpeedySpeech(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -153,8 +169,17 @@ class SpeedySpeech(nn.Module): o_de = self.decoder(o_en_ex, y_mask, g=g) return o_de, attn.transpose(1, 2) - def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument + def forward(self, + x, + x_lengths, + y_lengths, + dr, + cond_input={ + 'x_vectors': None, + 'speaker_ids': None + }): # pylint: disable=unused-argument """ + TODO: speaker embedding for speaker_ids Shapes: x: [B, T_max] x_lengths: [B] @@ -162,35 +187,113 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) - return o_de, o_dr_log.squeeze(1), attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr, + x_mask, + y_lengths, + g=g) + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'durations_log': o_dr_log.squeeze(1), + 'alignments': attn + } + return outputs - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, + x, + cond_input={ + 'x_vectors': None, + 'speaker_ids': None + }): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: inference_padding += 13 - x.shape[1] # pad input to prevent dropping the last word - x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0) + x = torch.nn.functional.pad(x, + pad=(0, inference_padding), + mode="constant", + value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) # duration predictor pass o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + o_dr, + x_mask, + y_lengths, + g=g) + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'alignments': attn, + 'durations_log': None + } + return outputs - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def train_step(self, batch: dict, criterion: nn.Module): + text_input = 
batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + speaker_ids = batch['speaker_ids'] + durations = batch['durations'] + + cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, + durations, cond_input) + + # compute loss + loss_dict = criterion(outputs['model_outputs'], mel_input, + mel_lengths, outputs['durations_log'], + torch.log(1 + durations), text_lengths) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], + binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) + + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: From fb9289d365ab331aabd50013a8701421ed0fa416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:03:56 +0200 Subject: [PATCH 025/258] update `synthesis.py` for being more generic --- TTS/tts/utils/synthesis.py | 58 ++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 4c3331c8..67432320 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -152,16 +152,6 @@ def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): return decoder_output, postnet_output, None, None -def parse_outputs_torch(postnet_output, decoder_output, alignments, - stop_tokens): - postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = None if decoder_output is None else decoder_output[ - 0].data.cpu().numpy() - alignment = alignments[0].cpu().data.numpy() - stop_tokens = None if stop_tokens is None else stop_tokens[0].cpu().numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): postnet_output = postnet_output[0].numpy() decoder_output = decoder_output[0].numpy() @@ -200,8 +190,8 @@ def speaker_id_to_torch(speaker_id, cuda=False): def embedding_to_torch(x_vector, cuda=False): if x_vector is not None: x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze( - 0).type(torch.FloatTensor) + x_vector = torch.from_numpy(x_vector).unsqueeze(0).type( + torch.FloatTensor) if cuda: return x_vector.cuda() return x_vector @@ -263,57 +253,59 @@ def synthesis( else: style_mel = 
compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text - inputs = text_to_seq(text, CONFIG) + text_inputs = text_to_seq(text, CONFIG) # pass tensors to backend if backend == "torch": if speaker_id is not None: speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) if x_vector is not None: - x_vector = embedding_to_torch(x_vector, - cuda=use_cuda) + x_vector = embedding_to_torch(x_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) - inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) - inputs = inputs.unsqueeze(0) + text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) + text_inputs = text_inputs.unsqueeze(0) elif backend == "tf": # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) + text_inputs = numpy_to_tf(text_inputs, tf.int32) + text_inputs = tf.expand_dims(text_inputs, 0) elif backend == "tflite": style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) + text_inputs = numpy_to_tf(text_inputs, tf.int32) + text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": outputs = run_model_torch(model, - inputs, + text_inputs, speaker_id, style_mel, x_vector=x_vector) - postnet_output, decoder_output, alignments, stop_tokens = \ - outputs['postnet_outputs'], outputs['decoder_outputs'],\ - outputs['alignments'], outputs['stop_tokens'] - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens) + model_outputs = outputs['model_outputs'] + model_outputs = model_outputs[0].data.cpu().numpy() elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, speaker_id, style_mel) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( + model, text_inputs, CONFIG, speaker_id, style_mel) + model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, inputs, CONFIG, speaker_id, style_mel) - postnet_output, decoder_output = parse_outputs_tflite( + model, text_inputs, CONFIG, speaker_id, style_mel) + model_outputs, decoder_output = parse_outputs_tflite( postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None if use_griffin_lim: - wav = inv_spectrogram(postnet_output, ap, CONFIG) + wav = inv_spectrogram(model_outputs, ap, CONFIG) # trim silence if do_trim_silence: wav = trim_silence(wav, ap) - return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs + return_dict = { + 'wav': wav, + 'alignments': outputs['alignments'], + 'model_outputs': model_outputs, + 'text_inputs': text_inputs + } + return return_dict From 8dfd4c91ff300f83d6c7aed6b1fa2fc325859069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:24:26 +0200 Subject: [PATCH 026/258] update trainer.py for better logging handling, restoring models and rename init_ functions with get_ --- TTS/bin/train_tts.py | 6 +++++- TTS/trainer.py | 22 ++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 5058d341..7cc8a25f 100644 --- 
a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -10,7 +10,11 @@ def main(): # try: args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training( sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) + trainer = TrainerTTS(args, + config, + c_logger, + tb_logger, + output_path=OUT_PATH) trainer.fit() # except KeyboardInterrupt: # remove_experiment_folder(OUT_PATH) diff --git a/TTS/trainer.py b/TTS/trainer.py index 3beb281f..6087f1bc 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- +import importlib +import logging import os import sys import time import traceback +from logging import StreamHandler from random import randrange -import logging -import importlib import numpy as np import torch @@ -16,19 +17,19 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets import load_meta_data, TTSDataset +from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.arguments import init_training -from TTS.tts.utils.visual import plot_spectrogram, plot_alignment from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict, find_module -from TTS.utils.training import setup_torch_training_env, check_update +from TTS.utils.generic_utils import KeepAverage, count_parameters, find_module, remove_experiment_folder, set_init_dict +from TTS.utils.training import check_update, setup_torch_training_env @dataclass @@ -140,9 +141,8 @@ class TrainerTTS: self.config, args.restore_path, self.model, self.optimizer, self.scaler) - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() + # setup scheduler + self.scheduler = self.get_scheduler(self.config, self.optimizer) # DISTRUBUTED if self.num_gpus > 1: @@ -150,8 +150,7 @@ # count model size num_params = count_parameters(self.model) - logging.info("\n > Model has {} parameters".format(num_params), - flush=True) + logging.info("\n > Model has {} parameters".format(num_params)) @staticmethod def get_model(num_chars: int, num_speakers: int, config: Coqpit, @@ -241,7 +240,6 @@ try: logging.info(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) - # optimizer restore logging.info(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint["optimizer"]) if "scaler" in checkpoint and config.mixed_precision: From 8213ad8b5f9fd58897a0be131c9304fe466088a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:25:25 +0200 Subject: [PATCH 027/258] update align_tts_config for the trainer --- TTS/tts/configs/align_tts_config.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 2956d935..115e969c 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -92,13 +92,24 @@ class
AlignTTSConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "Adam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # overrides min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 + + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." + ]) + From 4f66e816d16ad72f1825d89e0e8de7f5d72e2385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:25:40 +0200 Subject: [PATCH 028/258] update align_tts_loss for trainer --- TTS/tts/layers/losses.py | 42 +++++----------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 27c6e9e5..517eb533 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -462,13 +462,12 @@ class MDNLoss(nn.Module): class AlignTTSLoss(nn.Module): """Modified AlignTTS Loss. - Computes following losses + Computes - L1 and SSIM losses from output spectrograms. - Huber loss for duration predictor. - MDNLoss for Mixture of Density Network. - All the losses are aggregated by a weighted sum with the loss alphas. - Alphas can be scheduled based on number of steps. + All loss values are aggregated by a weighted sum of the alpha values. Args: c (dict): TTS model configuration. @@ -487,9 +486,9 @@ class AlignTTSLoss(nn.Module): self.mdn_alpha = c.mdn_alpha def forward( - self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, step, phase + self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, phase ): - ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) + # ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0 if phase == 0: mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens) @@ -507,36 +506,5 @@ class AlignTTSLoss(nn.Module): spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens) ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens) dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens) - loss = spec_loss_alpha * spec_loss + ssim_alpha * ssim_loss + dur_loss_alpha * dur_loss + mdn_alpha * mdn_loss + loss = self.spec_loss_alpha * spec_loss + self.ssim_alpha * ssim_loss + self.dur_loss_alpha * dur_loss + self.mdn_alpha * mdn_loss return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} - - @staticmethod - def _set_alpha(step, alpha_settings): - """Set the loss alpha wrt number of steps. - Return the corresponding value if no schedule is set. - - Example: - Setting a alpha schedule. - if ```alpha_settings``` is ```[[0, 1], [10000, 0.1]]``` then ```return_alpha == 1``` until 10k steps, then set to 0.1. - if ```alpha_settings``` is a constant value then ```return_alpha``` is set to that constant. 
- - Args: - step (int): number of training steps. - alpha_settings (int or list): constant alpha value or a list defining the schedule as explained above. - """ - return_alpha = None - if isinstance(alpha_settings, list): - for key, alpha in alpha_settings: - if key < step: - return_alpha = alpha - elif isinstance(alpha_settings, (float, int)): - return_alpha = alpha_settings - return return_alpha - - def set_alphas(self, step): - """Set the alpha values for all the loss functions""" - ssim_alpha = self._set_alpha(step, self.ssim_alpha) - dur_loss_alpha = self._set_alpha(step, self.dur_loss_alpha) - spec_loss_alpha = self._set_alpha(step, self.spec_loss_alpha) - mdn_alpha = self._set_alpha(step, self.mdn_alpha) - return ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha From c82d91051de2ea2720d7dfa8710a509df30483ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:26:09 +0200 Subject: [PATCH 029/258] update align_tts.py model for the trainer --- TTS/tts/models/align_tts.py | 198 ++++++++++++++++++++++++++++++------ 1 file changed, 168 insertions(+), 30 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index db04b72c..6d61eae2 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -4,6 +4,9 @@ import torch.nn as nn from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path @@ -69,9 +72,19 @@ class AlignTTS(nn.Module): hidden_channels=256, hidden_channels_dp=256, encoder_type="fftransformer", - encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, + encoder_params={ + "hidden_channels_ffn": 1024, + "num_heads": 2, + "num_layers": 6, + "dropout_p": 0.1 + }, decoder_type="fftransformer", - decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, + decoder_params={ + "hidden_channels_ffn": 1024, + "num_heads": 2, + "num_layers": 6, + "dropout_p": 0.1 + }, length_scale=1, num_speakers=0, external_c=False, @@ -79,11 +92,15 @@ class AlignTTS(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale + self.phase = -1 + self.length_scale = float(length_scale) if isinstance( + length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, + encoder_params, c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, + decoder_params) self.duration_predictor = DurationPredictor(hidden_channels_dp) self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) @@ -104,9 +121,9 @@ class AlignTTS(nn.Module): mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] log_sigma = log_sigma.transpose(1, 
2).unsqueeze(2) # [B, T2, 1, D] expanded_y, expanded_mu = torch.broadcast_tensors(y, mu) - exponential = -0.5 * torch.mean( - torch._C._nn.mse_loss(expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), dim=-1 - ) # B, L, T + exponential = -0.5 * torch.mean(torch._C._nn.mse_loss( + expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), + dim=-1) # B, L, T logp = exponential - 0.5 * log_sigma.mean(dim=-1) return logp @@ -140,7 +157,9 @@ class AlignTTS(nn.Module): [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) + o_en_ex = torch.matmul( + attn.squeeze(1).transpose(1, 2), en.transpose(1, + 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -174,7 +193,8 @@ class AlignTTS(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), + 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -187,7 +207,8 @@ class AlignTTS(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -203,11 +224,13 @@ class AlignTTS(nn.Module): def _forward_mdn(self, o_en, y, y_lengths, x_mask): # MAS potentials and alignment mu, log_sigma = self.mdn_block(o_en) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) - dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en.dtype) + dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, + y_mask) return dr_mas, mu, log_sigma, logp - def forward(self, x, x_lengths, y, y_lengths, phase=None, g=None): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] @@ -216,47 +239,85 @@ class AlignTTS(nn.Module): dr: [B, T_max] g: [B, C] """ + y = y.transpose(1, 2) + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en.detach(), o_en_dp.detach(), dr_mas.detach(), x_mask, y_lengths, g=g) + o_de, attn = self._forward_decoder(o_en.detach(), + o_en_dp.detach(), + dr_mas.detach(), + x_mask, + y_lengths, + g=g) elif phase == 2: # train the whole except duration predictor o_en, 
o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) elif phase == 3: # train duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(x, x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) o_dr_log = o_dr_log.squeeze(1) else: o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) o_dr_log = o_dr_log.squeeze(1) dr_mas_log = torch.log(dr_mas + 1).squeeze(1) - return o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'alignments': attn, + 'durations_log': o_dr_log, + 'durations_mas_log': dr_mas_log, + 'mu': mu, + 'log_sigma': log_sigma, + 'logp': logp + } + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, x, cond_input={'x_vectors': None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) @@ -265,14 +326,91 @@ class AlignTTS(nn.Module): # duration predictor pass o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + o_dr, + x_mask, + y_lengths, + g=g) + outputs = {'model_outputs': o_de.transpose(1, 2), 'alignments': attn} + return outputs - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def train_step(self, batch: dict, criterion: nn.Module): + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + speaker_ids = batch['speaker_ids'] + + cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) + loss_dict = criterion( + outputs['logp'], + outputs['model_outputs'], + mel_input, + mel_lengths, + outputs['durations_log'], + outputs['durations_mas_log'], + text_lengths, + phase=self.phase, + ) + + # compute alignment error (the lower the better 
) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], + binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) + + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training + + @staticmethod + def _set_phase(config, global_step): + """Decide AlignTTS training phase""" + if isinstance(config.phase_start_steps, list): + vals = [i < global_step for i in config.phase_start_steps] + if not True in vals: + phase = 0 + else: + phase = ( + len(config.phase_start_steps) + - [i < global_step for i in config.phase_start_steps][::-1].index(True) + - 1 + ) + else: + phase = None + return phase + + def on_epoch_start(self, trainer): + """Set AlignTTS training phase on epoch start.""" + self.phase = self._set_phase(trainer.config, trainer.total_steps_done) From 0c05318e8dc82014c18682a71124dce9daacfef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:26:31 +0200 Subject: [PATCH 030/258] update tts training tests to use the trainer --- tests/tts_tests/test_align_tts_train.py | 5 +++-- tests/tts_tests/test_glow_tts_train.py | 7 ++++--- tests/tts_tests/test_speedy_speech_train.py | 7 ++++--- tests/tts_tests/test_tacotron2_train.py | 5 +++-- tests/tts_tests/test_tacotron_train.py | 5 +++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 848f46c1..4bf3802f 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -30,12 +30,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs -1" ) run_cli(command_train) @@ -44,7 +45,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python 
TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 2e675d13..6df25baa 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -30,13 +30,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -45,7 +46,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 3f508117..e4413438 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -30,13 +30,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -45,7 +46,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index dbec309b..aef507a5 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -31,12 +31,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " 
f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -45,7 +46,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 34ee6e06..771ad93c 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -30,12 +30,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -44,7 +45,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) From fd6afe5ae5cb177a0310e928495fa96eee901293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:47:24 +0200 Subject: [PATCH 031/258] update `setup_model.py` imports --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/convert_tacotron2_torch_to_tf.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/utils/synthesizer.py | 2 +- tests/inference_tests/test_synthesizer.py | 2 +- tests/test_extract_tts_spectrograms.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index e14ff433..3cbf40ba 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import load_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index d523d01e..e7f991be 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -11,7 +11,7 @@ import torch from TTS.tts.tf.models.tacotron2 import Tacotron2 from 
TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.generic_utils import save_checkpoint -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.utils.io import load_config diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index e8814a11..6845c11f 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -12,7 +12,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.speakers import parse_speakers from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index d231484a..b94d7a65 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -16,7 +16,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9a455a1b..9df185ee 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -17,7 +17,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 742a27d8..57ff4272 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -18,7 +18,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index b5e38b80..f833ffc6 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -14,7 +14,7 @@ from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from 
TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 5962950f..8af95a12 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -6,7 +6,7 @@ import pysbd import torch from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index a1cd4de5..b0fa22d3 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -3,7 +3,7 @@ import unittest from tests import get_tests_output_path from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.synthesizer import Synthesizer diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index 38cee473..ddc7e4da 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -5,7 +5,7 @@ import torch from tests import get_tests_input_path, get_tests_output_path, run_cli from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols torch.manual_seed(1) From c7ff175592982de618a70c5fa35ee2849db0ca1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:38:46 +0200 Subject: [PATCH 032/258] revert logging.info to print statements for trainer --- TTS/trainer.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 6087f1bc..63b9cd42 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -150,7 +150,7 @@ class TrainerTTS: # count model size num_params = count_parameters(self.model) - logging.info("\n > Model has {} parameters".format(num_params)) + print("\n > Model has {} parameters".format(num_params)) @staticmethod def get_model(num_chars: int, num_speakers: int, config: Coqpit, @@ -186,7 +186,6 @@ class TrainerTTS: out_path: str = "", data_train: List = []) -> SpeakerManager: speaker_manager = SpeakerManager() - if config.use_speaker_embedding: if restore_path: speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") @@ -196,16 +195,6 @@ class TrainerTTS: ) speakers_file = config.external_speaker_embedding_file - if config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) - else: - speaker_manager.load_ids_file(speakers_file) - elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file( - config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(data_train) - file_path = os.path.join(out_path, "speakers.json") speaker_manager.save_ids_file(file_path) return speaker_manager @@ -238,15 +227,15 @@ class TrainerTTS: print(" > Restoring from %s ..." 
% os.path.basename(restore_path)) checkpoint = torch.load(restore_path) try: - logging.info(" > Restoring Model...") + print(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) - logging.info(" > Restoring Optimizer...") + print(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint["optimizer"]) if "scaler" in checkpoint and config.mixed_precision: - logging.info(" > Restoring AMP Scaler...") + print(" > Restoring AMP Scaler...") scaler.load_state_dict(checkpoint["scaler"]) except (KeyError, RuntimeError): - logging.info(" > Partial model initialization...") + print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint["model"], config) model.load_state_dict(model_dict) From f0a419546b2936d9fca0dc383b91ab181515e432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:39:34 +0200 Subject: [PATCH 033/258] fix `Synthesized` for the new `synthesis()` --- TTS/tts/utils/synthesis.py | 36 ------------------------------------ TTS/utils/synthesizer.py | 6 ++++-- 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 67432320..93d023cb 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -77,42 +77,6 @@ def run_model_torch(model, 'x_vector': x_vector, 'style_mel': style_mel }) - # elif "glow" in CONFIG.model.lower(): - # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - # if hasattr(model, "module"): - # # distributed model - # postnet_output, _, _, _, alignments, _, _ = model.module.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # else: - # postnet_output, _, _, _, alignments, _, _ = model.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # postnet_output = postnet_output.permute(0, 2, 1) - # # these only belong to tacotron models. - # decoder_output = None - # stop_tokens = None - # elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: - # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - # if hasattr(model, "module"): - # # distributed model - # postnet_output, alignments = model.module.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # else: - # postnet_output, alignments = model.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # postnet_output = postnet_output.permute(0, 2, 1) - # # these only belong to tacotron models. - # decoder_output = None - # stop_tokens = None - # else: - # raise ValueError("[!] 
Unknown model name.") return outputs diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8af95a12..a8332eb8 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -222,7 +222,7 @@ class Synthesizer(object): for sen in sens: # synthesize voice - waveform, _, _, mel_postnet_spec, _, _ = synthesis( + outputs = synthesis( model=self.tts_model, text=sen, CONFIG=self.tts_config, @@ -232,8 +232,10 @@ class Synthesizer(object): style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - speaker_embedding=speaker_embedding, + x_vector=speaker_embedding, ) + waveform = outputs['wav'] + mel_postnet_spec = outputs['model_outputs'] if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T From c673eb8ef8da78c5caaf2f03607d462809610321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 15:18:10 +0200 Subject: [PATCH 034/258] correct import of `load_meta_data` remove redundant import --- TTS/bin/compute_embeddings.py | 2 +- TTS/bin/compute_statistics.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_encoder.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 872fc875..885d66b3 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -7,7 +7,7 @@ from tqdm import tqdm from TTS.config import BaseDatasetConfig, load_config from TTS.speaker_encoder.utils.generic_utils import setup_model -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index f3234c2a..25e3fce5 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -10,7 +10,7 @@ from tqdm import tqdm # from TTS.utils.io import load_config from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 6845c11f..408f334e 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -10,7 +10,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index b94d7a65..34eba7a8 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -13,7 +13,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_encoder.py 
b/TTS/bin/train_encoder.py index 48309dc9..6e4a9b32 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,7 +13,7 @@ from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.utils.arguments import init_training from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9df185ee..a138abeb 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -14,7 +14,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 57ff4272..4dc3f5f0 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -15,7 +15,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index f833ffc6..69ffbb6c 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -11,7 +11,7 @@ import numpy as np import torch from torch.utils.data import DataLoader -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models import setup_model From 830306d2fd81eb93db5fcc57daf4d628f8d40281 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 15:18:36 +0200 Subject: [PATCH 035/258] update `extract_tts_spectrograms` for the new model API --- TTS/bin/extract_tts_spectrograms.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 408f334e..deac7fc5 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -146,20 +146,22 @@ def inference( elif speaker_embeddings is not None: speaker_c = speaker_embeddings - model_output, *_ = model.inference_with_MAS( + outputs = model.inference_with_MAS( text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c ) + model_output = outputs['model_outputs'] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - _, postnet_outputs, *_ = model( + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': speaker_embeddings} + outputs = model( text_input, text_lengths, mel_input, mel_lengths, - speaker_ids=speaker_ids, - 
speaker_embeddings=speaker_embeddings, + cond_input ) + postnet_outputs = outputs['model_outputs'] # normalize tacotron output if model_name == "tacotron": mel_specs = [] From 667bb708b6d9a4bf4f7dd99c872c865b58b034db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 10:49:48 +0200 Subject: [PATCH 036/258] update `extract_tts_spec...` using `SpeakerManager` --- TTS/bin/extract_tts_spectrograms.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index deac7fc5..26a4b2f4 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -13,7 +13,7 @@ from TTS.config import load_config from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import parse_speakers +from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -39,7 +39,9 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None, + speaker_mapping=speaker_manager.speaker_ids + if c.use_speaker_embedding and c.use_external_speaker_embedding_file + else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -91,7 +93,7 @@ def format_data(data): speaker_embeddings = data[8] speaker_ids = None else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] + speaker_ids = [speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] speaker_ids = torch.LongTensor(speaker_ids) speaker_embeddings = None else: @@ -134,12 +136,11 @@ def inference( text_lengths, mel_input, mel_lengths, - attn_mask=None, speaker_ids=None, speaker_embeddings=None, ): if model_name == "glow_tts": - mel_input = mel_input.permute(0, 2, 1) # B x D x T + # mel_input = mel_input.permute(0, 2, 1) # B x D x T speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids @@ -147,9 +148,9 @@ def inference( speaker_c = speaker_embeddings outputs = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c + text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c} ) - model_output = outputs['model_outputs'] + model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: @@ -193,7 +194,7 @@ def extract_spectrograms( speaker_embeddings, _, _, - attn_mask, + _, item_idx, ) = format_data(data) @@ -205,7 +206,6 @@ def extract_spectrograms( text_lengths, mel_input, mel_lengths, - attn_mask, speaker_ids, speaker_embeddings, ) @@ -242,7 +242,7 @@ def extract_spectrograms( def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data, symbols, phonemes, model_characters, speaker_mapping + global meta_data, symbols, phonemes, model_characters, speaker_manager # Audio processor ap = AudioProcessor(**c.audio) @@ -260,10 +260,10 @@ def main(args): # pylint: disable=redefined-outer-name meta_data = meta_data_train + meta_data_eval # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = 
parse_speakers(c, args, meta_data_train, None) + speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim) + model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") From 88d8a94a104e9577d8988c23e84b67488bf53149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:35:26 +0200 Subject: [PATCH 037/258] update extract_tts_spectrogram for `cond_input` API of the models --- TTS/bin/extract_tts_spectrograms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 26a4b2f4..95171a9b 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -140,7 +140,6 @@ def inference( speaker_embeddings=None, ): if model_name == "glow_tts": - # mel_input = mel_input.permute(0, 2, 1) # B x D x T speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids From aefa71155cf72a9e1fd37df6cb6e96421f805c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 17:25:00 +0200 Subject: [PATCH 038/258] make style --- TTS/bin/extract_tts_spectrograms.py | 12 +- TTS/bin/train_tts.py | 9 +- TTS/trainer.py | 140 ++++------- TTS/tts/configs/align_tts_config.py | 21 +- TTS/tts/configs/glow_tts_config.py | 4 +- TTS/tts/configs/shared_configs.py | 4 +- TTS/tts/configs/speedy_speech_config.py | 18 +- TTS/tts/configs/tacotron_config.py | 20 +- TTS/tts/datasets/__init__.py | 10 +- TTS/tts/layers/losses.py | 7 +- TTS/tts/models/align_tts.py | 166 +++++-------- TTS/tts/models/glow_tts.py | 186 ++++++--------- TTS/tts/models/speedy_speech.py | 126 ++++------ TTS/tts/models/tacotron.py | 167 ++++++------- TTS/tts/models/tacotron2.py | 251 ++++++++++---------- TTS/tts/models/tacotron_abstract.py | 2 +- TTS/tts/utils/synthesis.py | 64 ++--- TTS/utils/arguments.py | 39 +-- TTS/utils/synthesizer.py | 4 +- tests/tts_tests/test_align_tts_train.py | 4 +- tests/tts_tests/test_glow_tts_train.py | 4 +- tests/tts_tests/test_speedy_speech_train.py | 4 +- tests/tts_tests/test_tacotron2_train.py | 4 +- tests/tts_tests/test_tacotron_train.py | 4 +- tests/vocoder_tests/test_melgan_train.py | 1 - 25 files changed, 524 insertions(+), 747 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 95171a9b..64abc719 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -153,15 +153,9 @@ def inference( model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': speaker_embeddings} - outputs = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input - ) - postnet_outputs = outputs['model_outputs'] + cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) + postnet_outputs = outputs["model_outputs"] # normalize tacotron output if model_name == "tacotron": mel_specs = [] diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 7cc8a25f..607a4e3b 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -8,13 +8,8 @@ from TTS.trainer import TrainerTTS def main(): # try: - args, config, OUT_PATH, AUDIO_PATH, c_logger, 
tb_logger = init_training( - sys.argv) - trainer = TrainerTTS(args, - config, - c_logger, - tb_logger, - output_path=OUT_PATH) + args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) trainer.fit() # except KeyboardInterrupt: # remove_experiment_folder(OUT_PATH) diff --git a/TTS/trainer.py b/TTS/trainer.py index 63b9cd42..06d5d6b5 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -84,7 +84,7 @@ class TrainerTTS: self.best_loss = float("inf") self.train_loader = None self.eval_loader = None - self.output_audio_path = os.path.join(output_path, 'test_audios') + self.output_audio_path = os.path.join(output_path, "test_audios") self.keep_avg_train = None self.keep_avg_eval = None @@ -138,8 +138,8 @@ class TrainerTTS: if self.args.restore_path: self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, - self.scaler) + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) # setup scheduler self.scheduler = self.get_scheduler(self.config, self.optimizer) @@ -207,6 +207,7 @@ class TrainerTTS: return None if lr_scheduler.lower() == "noamlr": from TTS.utils.training import NoamLR + scheduler = NoamLR else: scheduler = getattr(torch.optim, lr_scheduler) @@ -261,8 +262,7 @@ class TrainerTTS: ap=ap, tp=self.config.characters, add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else - self.config.batch_group_size * self.config.batch_size, + batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, min_seq_len=self.config.min_seq_len, max_seq_len=self.config.max_seq_len, phoneme_cache_path=self.config.phoneme_cache_path, @@ -272,8 +272,8 @@ class TrainerTTS: use_noise_augment=not is_eval, verbose=verbose, speaker_mapping=speaker_mapping - if self.config.use_speaker_embedding - and self.config.use_external_speaker_embedding_file else None, + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -281,18 +281,15 @@ class TrainerTTS: dataset.compute_input_seq(self.config.num_loader_workers) dataset.sort_items() - sampler = DistributedSampler( - dataset) if self.num_gpus > 1 else None + sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None loader = DataLoader( dataset, - batch_size=self.config.eval_batch_size - if is_eval else self.config.batch_size, + batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=sampler, - num_workers=self.config.num_val_loader_workers - if is_eval else self.config.num_loader_workers, + num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, pin_memory=False, ) return loader @@ -314,8 +311,7 @@ class TrainerTTS: text_input = batch[0] text_lengths = batch[1] speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron" - ] else None + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None mel_input = batch[4] mel_lengths = batch[5] stop_targets = batch[6] @@ -331,10 +327,7 @@ class TrainerTTS: speaker_embeddings = batch[8] speaker_ids = None else: - speaker_ids = [ - self.speaker_manager.speaker_ids[speaker_name] - for speaker_name in speaker_names - ] + speaker_ids = 
[self.speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] speaker_ids = torch.LongTensor(speaker_ids) speaker_embeddings = None else: @@ -346,7 +339,7 @@ class TrainerTTS: durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) for idx, am in enumerate(attn_mask): # compute raw durations - c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1] + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) c_idxs, counts = torch.unique(c_idxs, return_counts=True) dur = torch.ones([text_lengths[idx]]).to(counts.dtype) @@ -359,14 +352,11 @@ class TrainerTTS: assert ( dur.sum() == mel_lengths[idx] ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, :text_lengths[idx]] = dur + durations[idx, : text_lengths[idx]] = dur # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], - stop_targets.size(1) // self.config.r, - -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze(2) + stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) # dispatch batch to GPU if self.use_cuda: @@ -374,15 +364,10 @@ class TrainerTTS: text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda( - non_blocking=True) if self.config.model.lower() in [ - "tacotron" - ] else None + linear_input = linear_input.cuda(non_blocking=True) if self.config.model.lower() in ["tacotron"] else None stop_targets = stop_targets.cuda(non_blocking=True) - attn_mask = attn_mask.cuda( - non_blocking=True) if attn_mask is not None else None - durations = durations.cuda( - non_blocking=True) if attn_mask is not None else None + attn_mask = attn_mask.cuda(non_blocking=True) if attn_mask is not None else None + durations = durations.cuda(non_blocking=True) if attn_mask is not None else None if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) if speaker_embeddings is not None: @@ -401,7 +386,7 @@ class TrainerTTS: "x_vectors": speaker_embeddings, "max_text_length": max_text_length, "max_spec_length": max_spec_length, - "item_idx": item_idx + "item_idx": item_idx, } def train_step(self, batch: Dict, batch_n_steps: int, step: int, @@ -421,25 +406,20 @@ class TrainerTTS: # check nan loss if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError( - f"Detected NaN loss at step {self.total_steps_done}.") + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") # optimizer step if self.config.mixed_precision: # model optimizer step in mixed precision mode self.scaler.scale(loss_dict["loss"]).backward() self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, - self.config.grad_clip, - ignore_stopnet=True) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) self.scaler.step(self.optimizer) self.scaler.update() else: # main model optimizer step loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, - self.config.grad_clip, - ignore_stopnet=True) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) self.optimizer.step() step_time = time.time() - step_start_time @@ -469,17 +449,15 @@ class TrainerTTS: current_lr = 
self.optimizer.param_groups[0]["lr"] if self.total_steps_done % self.config.print_step == 0: log_dict = { - "max_spec_length": [batch["max_spec_length"], - 1], # value, precision + "max_spec_length": [batch["max_spec_length"], 1], # value, precision "max_text_length": [batch["max_text_length"], 1], "step_time": [step_time, 4], "loader_time": [loader_time, 2], "current_lr": current_lr, } - self.c_logger.print_train_step(batch_n_steps, step, - self.total_steps_done, log_dict, - loss_dict, - self.keep_avg_train.avg_values) + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) if self.args.rank == 0: # Plot Training Iter Stats @@ -491,8 +469,7 @@ class TrainerTTS: "step_time": step_time, } iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, - iter_stats) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) if self.total_steps_done % self.config.save_step == 0: if self.config.checkpoint: @@ -506,15 +483,12 @@ class TrainerTTS: self.output_path, model_loss=loss_dict["loss"], characters=self.model_characters, - scaler=self.scaler.state_dict() - if self.config.mixed_precision else None, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) # training visualizations figures, audios = self.model.train_log(self.ap, batch, outputs) self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, - {"TrainAudio": audios}, - self.ap.sample_rate) + self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) self.total_steps_done += 1 self.on_train_step_end() return outputs, loss_dict @@ -523,35 +497,28 @@ class TrainerTTS: self.model.train() epoch_start_time = time.time() if self.use_cuda: - batch_num_steps = int( - len(self.train_loader.dataset) / - (self.config.batch_size * self.num_gpus)) + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) else: - batch_num_steps = int( - len(self.train_loader.dataset) / self.config.batch_size) + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) self.c_logger.print_train_start() loader_start_time = time.time() for cur_step, batch in enumerate(self.train_loader): - _, _ = self.train_step(batch, batch_num_steps, cur_step, - loader_start_time) + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) epoch_time = time.time() - epoch_start_time # Plot self.epochs_done Stats if self.args.rank == 0: epoch_stats = {"epoch_time": epoch_time} epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, - epoch_stats) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, - self.total_steps_done) + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.eval_step( - batch, self.criterion) + outputs, loss_dict = self.model.eval_step(batch, self.criterion) step_time = time.time() - step_start_time @@ -572,8 +539,7 @@ class TrainerTTS: self.keep_avg_eval.update_values(update_eval_values) if self.config.print_eval: - self.c_logger.print_eval_step(step, 
loss_dict, - self.keep_avg_eval.avg_values) + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) return outputs, loss_dict def eval_epoch(self) -> None: @@ -585,15 +551,13 @@ class TrainerTTS: # format data batch = self.format_batch(batch) loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({'avg_loader_time': loader_time}) + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) outputs, _ = self.eval_step(batch, cur_step) # Plot epoch stats and samples from the last batch. if self.args.rank == 0: figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, - {"EvalAudio": eval_audios}, - self.ap.sample_rate) + self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) def test_run(self, ) -> None: print(" | > Synthesizing test sentences.") @@ -608,9 +572,9 @@ class TrainerTTS: self.config, self.use_cuda, self.ap, - speaker_id=cond_inputs['speaker_id'], - x_vector=cond_inputs['x_vector'], - style_wav=cond_inputs['style_wav'], + speaker_id=cond_inputs["speaker_id"], + x_vector=cond_inputs["x_vector"], + style_wav=cond_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, @@ -623,10 +587,8 @@ class TrainerTTS: "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram( - model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment( - alignment, output_fig=False) + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) @@ -641,11 +603,11 @@ class TrainerTTS: if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None) # setup style_mel - if self.config.has('gst_style_input'): + if self.config.has("gst_style_input"): style_wav = self.config.gst_style_input else: style_wav = None - if style_wav is None and 'use_gst' in self.config and self.config.use_gst: + if style_wav is None and "use_gst" in self.config and self.config.use_gst: # inicialize GST with zero dict. 
style_wav = {} print( @@ -688,8 +650,7 @@ class TrainerTTS: for epoch in range(0, self.config.epochs): self.on_epoch_start() self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage( - ) if self.config.run_eval else None + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None self.epochs_done = epoch self.c_logger.print_epoch_start(epoch, self.config.epochs) self.train_epoch() @@ -698,8 +659,8 @@ class TrainerTTS: if epoch >= self.config.test_delay_epochs: self.test_run() self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values - if self.config.run_eval else self.keep_avg_train.avg_values) + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) self.save_best_model() self.on_epoch_end() @@ -717,8 +678,7 @@ class TrainerTTS: self.model_characters, keep_all_best=self.config.keep_all_best, keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() - if self.config.mixed_precision else None, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) @staticmethod diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 115e969c..56622741 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -93,7 +93,7 @@ class AlignTTSConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "Adam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = None lr_scheduler_params: dict = None lr: float = 1e-4 @@ -104,12 +104,13 @@ class AlignTTSConfig(BaseTTSConfig): max_seq_len: int = 200 r: int = 1 - # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) - + # testing + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 214b2377..925854c9 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -91,9 +91,9 @@ class GlowTTSConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = "NoamLR" - lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) grad_clip: float = 5.0 lr: float = 1e-3 diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 15adff45..dc9c8e0d 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -171,7 +171,7 @@ class BaseTTSConfig(BaseTrainingConfig): optimizer: str = MISSING optimizer_params: dict = MISSING # scheduler - lr_scheduler: str = '' + lr_scheduler: str = "" lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda:[]) + test_sentences: List[str] = field(default_factory=lambda: []) diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 42258398..d76d94e2 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -101,7 +101,7 @@ class SpeedySpeechConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = None lr_scheduler_params: dict = None lr: float = 1e-4 @@ -118,10 +118,12 @@ class SpeedySpeechConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 90decaa3..b197eaf6 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -160,9 +160,9 @@ class TacotronConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = "NoamLR" - lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) lr: float = 1e-4 grad_clip: float = 5.0 seq_len_norm: bool = False @@ -178,13 +178,15 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) def check_values(self): if self.gradual_training: diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index b238209f..69ab871d 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -44,22 +44,18 @@ def load_meta_data(datasets, eval_split=True): preprocessor = _get_preprocessor_by_name(name) # load train set meta_data_train = preprocessor(root_path, meta_file_train) - print( - f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}" - ) + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: meta_data_eval = preprocessor(root_path, meta_file_val) else: - meta_data_eval, meta_data_train = split_dataset( - meta_data_train) + meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for duration predictor training if dataset.meta_file_attn_mask: - meta_data = dict( - load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins[1]].strip() meta_data_train_all[idx].append(attn_file) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 517eb533..86d34c30 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -506,5 +506,10 @@ class AlignTTSLoss(nn.Module): spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens) ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens) dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens) - loss = self.spec_loss_alpha * spec_loss + self.ssim_alpha * ssim_loss + self.dur_loss_alpha * dur_loss + self.mdn_alpha * mdn_loss + loss = ( + self.spec_loss_alpha * 
spec_loss + + self.ssim_alpha * ssim_loss + + self.dur_loss_alpha * dur_loss + + self.mdn_alpha * mdn_loss + ) return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 6d61eae2..f94d9ca6 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -72,19 +72,9 @@ class AlignTTS(nn.Module): hidden_channels=256, hidden_channels_dp=256, encoder_type="fftransformer", - encoder_params={ - "hidden_channels_ffn": 1024, - "num_heads": 2, - "num_layers": 6, - "dropout_p": 0.1 - }, + encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, decoder_type="fftransformer", - decoder_params={ - "hidden_channels_ffn": 1024, - "num_heads": 2, - "num_layers": 6, - "dropout_p": 0.1 - }, + decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, length_scale=1, num_speakers=0, external_c=False, @@ -93,14 +83,11 @@ class AlignTTS(nn.Module): super().__init__() self.phase = -1 - self.length_scale = float(length_scale) if isinstance( - length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, - encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, - decoder_params) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) self.duration_predictor = DurationPredictor(hidden_channels_dp) self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) @@ -121,9 +108,9 @@ class AlignTTS(nn.Module): mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] log_sigma = log_sigma.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] expanded_y, expanded_mu = torch.broadcast_tensors(y, mu) - exponential = -0.5 * torch.mean(torch._C._nn.mse_loss( - expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), - dim=-1) # B, L, T + exponential = -0.5 * torch.mean( + torch._C._nn.mse_loss(expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), dim=-1 + ) # B, L, T logp = exponential - 0.5 * log_sigma.mean(dim=-1) return logp @@ -157,9 +144,7 @@ class AlignTTS(nn.Module): [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) - o_en_ex = torch.matmul( - attn.squeeze(1).transpose(1, 2), en.transpose(1, - 2)).transpose(1, 2) + o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -193,8 +178,7 @@ class AlignTTS(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), - 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -207,8 +191,7 @@ class AlignTTS(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = 
self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -224,13 +207,13 @@ class AlignTTS(nn.Module): def _forward_mdn(self, o_en, y, y_lengths, x_mask): # MAS potentials and alignment mu, log_sigma = self.mdn_block(o_en) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en.dtype) - dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, - y_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) + dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward(self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None + ): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] @@ -240,83 +223,58 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en.detach(), - o_en_dp.detach(), - dr_mas.detach(), - x_mask, - y_lengths, - g=g) + o_de, attn = self._forward_decoder(o_en.detach(), o_en_dp.detach(), dr_mas.detach(), x_mask, y_lengths, g=g) elif phase == 2: # train the whole except duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) elif phase == 3: # train duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(x, x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) o_dr_log = o_dr_log.squeeze(1) else: o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) o_dr_log = 
o_dr_log.squeeze(1) dr_mas_log = torch.log(dr_mas + 1).squeeze(1) outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'alignments': attn, - 'durations_log': o_dr_log, - 'durations_mas_log': dr_mas_log, - 'mu': mu, - 'log_sigma': log_sigma, - 'logp': logp + "model_outputs": o_de.transpose(1, 2), + "alignments": attn, + "durations_log": o_dr_log, + "durations_mas_log": dr_mas_log, + "mu": mu, + "log_sigma": log_sigma, + "logp": logp, } return outputs @torch.no_grad() - def inference(self, x, cond_input={'x_vectors': None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -326,46 +284,40 @@ class AlignTTS(nn.Module): # duration predictor pass o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - o_dr, - x_mask, - y_lengths, - g=g) - outputs = {'model_outputs': o_de.transpose(1, 2), 'alignments': attn} + o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn} return outputs def train_step(self, batch: dict, criterion: nn.Module): - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] - speaker_ids = batch['speaker_ids'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] + speaker_ids = batch["speaker_ids"] - cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) loss_dict = criterion( - outputs['logp'], - outputs['model_outputs'], - mel_input, - mel_lengths, - outputs['durations_log'], - outputs['durations_mas_log'], - text_lengths, - phase=self.phase, - ) + outputs["logp"], + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + outputs["durations_mas_log"], + text_lengths, + phase=self.phase, + ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], - binary=True) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -387,7 +339,9 @@ class AlignTTS(nn.Module): def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): return self.train_log(ap, batch, outputs) - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def 
load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 09e58ce7..e1c07212 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -38,6 +38,7 @@ class GlowTTS(nn.Module): encoder_params (dict): encoder module parameters. speaker_embedding_dim (int): channels of external speaker embedding vectors. """ + def __init__( self, num_chars, @@ -132,17 +133,17 @@ class GlowTTS(nn.Module): @staticmethod def compute_outputs(attn, o_mean, o_log_scale, x_mask): # compute final values with the computed alignment - y_mean = torch.matmul( - attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( - 1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] - y_log_scale = torch.matmul( - attn.squeeze(1).transpose(1, 2), o_log_scale.transpose( - 1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] # compute total duration with adjustment o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y, y_lengths=None, cond_input={'x_vectors':None}): + def forward(self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None}): """ Shapes: x: [B, T] @@ -154,7 +155,7 @@ class GlowTTS(nn.Module): y_max_length = y.size(2) y = y.transpose(1, 2) # norm speaker embeddings - g = cond_input['x_vectors'] + g = cond_input["x_vectors"] if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -162,54 +163,38 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess( - y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path with torch.no_grad(): o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, - [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * - (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), - z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, - [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] - attn = maximum_path(logp, - attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, o_mean, o_log_scale, x_mask) + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - 'model_outputs': z, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": z, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @torch.no_grad() - def inference_with_MAS(self, - x, - x_lengths, - y=None, - y_lengths=None, - attn=None, - g=None): + def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): """ It's similar to the teacher forcing in Tacotron. It was proposed in: https://arxiv.org/abs/2104.05557 @@ -229,33 +214,24 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess( - y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path between z and encoder output o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, - [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * - (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), - z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, - [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, o_mean, o_log_scale, x_mask) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) # get predited aligned distribution @@ -264,13 +240,13 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = { - 'model_outputs': y, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": y, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @@ -290,8 +266,7 @@ class GlowTTS(nn.Module): else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(y.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) @@ -310,37 +285,31 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # compute output durations w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() y_max_length = None # compute masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # compute attention mask - attn = generate_path(w_ceil.squeeze(1), - attn_mask.squeeze(1)).unsqueeze(1) - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, 
o_mean, o_log_scale, x_mask) + attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) - z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * - self.inference_noise_scale) * y_mask + z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask # decoder pass y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - 'model_outputs': y, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": y, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @@ -351,32 +320,34 @@ class GlowTTS(nn.Module): batch (dict): [description] criterion (nn.Module): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] - outputs = self.forward(text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input={"x_vectors": x_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors}) - loss_dict = criterion(outputs['model_outputs'], outputs['y_mean'], - outputs['y_log_scale'], outputs['logdet'], - mel_lengths, outputs['durations_log'], - outputs['total_durations_log'], text_lengths) + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + text_lengths, + ) - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], binary=True) + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -400,8 +371,7 @@ class GlowTTS(nn.Module): def preprocess(self, y, y_lengths, y_max_length, attn=None): if y_max_length is not None: - y_max_length = (y_max_length // - self.num_squeeze) * self.num_squeeze + y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze y = y[:, :, :y_max_length] if attn is not None: attn = attn[:, :, :, :y_max_length] @@ -411,7 +381,9 @@ class GlowTTS(nn.Module): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) 
self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index daf67b6c..69070ffa 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -49,12 +49,7 @@ class SpeedySpeech(nn.Module): positional_encoding=True, length_scale=1, encoder_type="residual_conv_bn", - encoder_params={ - "kernel_size": 4, - "dilations": 4 * [1, 2, 4] + [1], - "num_conv_blocks": 2, - "num_res_blocks": 13 - }, + encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, decoder_type="residual_conv_bn", decoder_params={ "kernel_size": 4, @@ -68,17 +63,13 @@ class SpeedySpeech(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance( - length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, - encoder_params, c_in_channels) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) if positional_encoding: self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, - decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + - c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) + self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) if num_speakers > 1 and not external_c: # speaker embedding layer @@ -105,9 +96,7 @@ class SpeedySpeech(nn.Module): """ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype) - o_en_ex = torch.matmul( - attn.squeeze(1).transpose(1, 2), en.transpose(1, - 2)).transpose(1, 2) + o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -141,8 +130,7 @@ class SpeedySpeech(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), - 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -155,8 +143,7 @@ class SpeedySpeech(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -169,15 +156,9 @@ class SpeedySpeech(nn.Module): o_de = self.decoder(o_en_ex, y_mask, g=g) return o_de, attn.transpose(1, 2) - def forward(self, - x, - x_lengths, - y_lengths, - dr, - cond_input={ - 'x_vectors': None, - 'speaker_ids': None - }): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None} + ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids Shapes: @@ -187,91 +168,68 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in 
cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr, - x_mask, - y_lengths, - g=g) - outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'durations_log': o_dr_log.squeeze(1), - 'alignments': attn - } + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, - x, - cond_input={ - 'x_vectors': None, - 'speaker_ids': None - }): # pylint: disable=unused-argument + def inference(self, x, cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: inference_padding += 13 - x.shape[1] # pad input to prevent dropping the last word - x = torch.nn.functional.pad(x, - pad=(0, inference_padding), - mode="constant", - value=0) + x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) # duration predictor pass o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - o_dr, - x_mask, - y_lengths, - g=g) - outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'alignments': attn, - 'durations_log': None - } + o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn, "durations_log": None} return outputs def train_step(self, batch: dict, criterion: nn.Module): - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] - speaker_ids = batch['speaker_ids'] - durations = batch['durations'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] + speaker_ids = batch["speaker_ids"] + durations = batch["durations"] - cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_lengths, - durations, cond_input) + cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) # compute loss - loss_dict = criterion(outputs['model_outputs'], mel_input, - mel_lengths, outputs['durations_log'], - torch.log(1 + durations), text_lengths) + loss_dict = criterion( + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + torch.log(1 + durations), + text_lengths, + ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], - binary=True) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, 
outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -293,7 +251,9 @@ class SpeedySpeech(nn.Module): def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): return self.train_log(ap, batch, outputs) - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 34f04159..19af28ff 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -50,6 +50,7 @@ class Tacotron(TacotronAbstract): gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ + def __init__( self, num_chars, @@ -78,7 +79,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, - gradual_training=[] + gradual_training=[], ): super().__init__( num_chars, @@ -106,15 +107,14 @@ class Tacotron(TacotronAbstract): speaker_embedding_dim, use_gst, gst, - gradual_training + gradual_training, ) # speaker embedding layers if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, - speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -145,8 +145,7 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, - postnet_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) # setup prenet dropout self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference @@ -183,12 +182,7 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) - def forward(self, - text, - text_lengths, - mel_specs=None, - mel_lengths=None, - cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): """ Shapes: text: [B, T_in] @@ -197,100 +191,87 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - outputs = { - 'alignments_backward': None, - 'decoder_outputs_backward': None - } + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim inputs = self.embedding(text) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( - encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) # speaker 
embedding if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, - None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) # sequence masking if output_mask is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze( - 1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) # B x T_out x decoder_in_features postnet_outputs = self.postnet(decoder_outputs) # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze( - 2).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) # B x T_out x posnet_dim postnet_outputs = self.last_linear(postnet_outputs) # B x T_out x decoder_in_features decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass( - mel_specs, encoder_outputs, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward - outputs.update({ - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens - }) + mel_specs, encoder_outputs, alignments, input_mask + ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) return outputs @torch.no_grad() - def inference(self, - text_input, - cond_input=None): + def inference(self, text_input, cond_input=None): inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, 
None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) outputs = { - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, } return outputs @@ -301,64 +282,61 @@ class Tacotron(TacotronAbstract): batch ([type]): [description] criterion ([type]): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - linear_input = batch['linear_input'] - stop_targets = batch['stop_targets'] - speaker_ids = batch['speaker_ids'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + x_vectors = batch["x_vectors"] # forward pass model - outputs = self.forward(text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input={ - 'speaker_ids': speaker_ids, - 'x_vectors': x_vectors - }) + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + ) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: alignment_lengths = ( - mel_lengths + - (self.decoder.r - - (mel_lengths.max() % self.decoder.r))) // self.decoder.r + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, - mel_lengths, cond_input) + cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss loss_dict = criterion( - outputs['model_outputs'], - outputs['decoder_outputs'], + outputs["model_outputs"], + outputs["decoder_outputs"], mel_input, linear_input, - outputs['stop_tokens'], + outputs["stop_tokens"], stop_targets, mel_lengths, - outputs['decoder_outputs_backward'], - outputs['alignments'], + outputs["decoder_outputs_backward"], + outputs["alignments"], alignment_lengths, - outputs['alignments_backward'], + outputs["alignments_backward"], text_lengths, ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments']) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap, 
batch, outputs): - postnet_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - alignments_backward = outputs['alignments_backward'] - mel_input = batch['mel_input'] + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] pred_spec = postnet_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -371,8 +349,7 @@ class Tacotron(TacotronAbstract): } if self.bidirectional_decoder or self.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False) + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) # Sample audio train_audio = ap.inv_spectrogram(pred_spec.T) @@ -382,4 +359,4 @@ class Tacotron(TacotronAbstract): return self.train_step(batch, criterion) def eval_log(self, ap, batch, outputs): - return self.train_log(ap, batch, outputs) \ No newline at end of file + return self.train_log(ap, batch, outputs) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 04b97606..4e111fda 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -49,49 +49,70 @@ class Tacotron2(TacotronAbstract): gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ - def __init__(self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - gradual_training=[]): - super().__init__(num_chars, num_speakers, r, postnet_output_dim, - decoder_output_dim, attn_type, attn_win, attn_norm, - prenet_type, prenet_dropout, - prenet_dropout_at_inference, forward_attn, - trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, bidirectional_decoder, - double_decoder_consistency, ddc_r, - encoder_in_features, decoder_in_features, - speaker_embedding_dim, use_gst, gst, gradual_training) + + def __init__( + self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type="original", + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + prenet_dropout_at_inference=False, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + use_gst=False, + gst=None, + gradual_training=[], + ): + super().__init__( + num_chars, + num_speakers, + r, + postnet_output_dim, + decoder_output_dim, + attn_type, + attn_win, + attn_norm, + prenet_type, + prenet_dropout, + prenet_dropout_at_inference, + forward_attn, + trans_agent, + forward_attn_mask, + location_attn, + attn_K, + separate_stopnet, + bidirectional_decoder, + double_decoder_consistency, + ddc_r, + encoder_in_features, + decoder_in_features, + speaker_embedding_dim, + use_gst, + gst, + 
gradual_training, + ) # speaker embedding layer if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, - speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -162,12 +183,7 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, - text, - text_lengths, - mel_specs=None, - mel_lengths=None, - cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): """ Shapes: text: [B, T_in] @@ -176,10 +192,7 @@ class Tacotron2(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - outputs = { - 'alignments_backward': None, - 'decoder_outputs_backward': None - } + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -189,55 +202,49 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, - None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( - encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) # sequence masking if mel_lengths is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze( - 1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) # B x mel_dim x T_out postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze( - 1).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in - decoder_outputs, postnet_outputs, alignments = self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) if self.bidirectional_decoder: - decoder_outputs_backward, 
alignments_backward = self._backward_pass( - mel_specs, encoder_outputs, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward - outputs.update({ - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens - }) + mel_specs, encoder_outputs, alignments, input_mask + ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) return outputs @torch.no_grad() @@ -247,29 +254,25 @@ class Tacotron2(TacotronAbstract): if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input['x_vectors'] + x_vector = cond_input["x_vectors"] - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, x_vector) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs - decoder_outputs, postnet_outputs, alignments = self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) outputs = { - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, } return outputs @@ -280,64 +283,61 @@ class Tacotron2(TacotronAbstract): batch ([type]): [description] criterion ([type]): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - linear_input = batch['linear_input'] - stop_targets = batch['stop_targets'] - speaker_ids = batch['speaker_ids'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + x_vectors = batch["x_vectors"] # forward pass model - outputs = self.forward(text_input, - 
text_lengths, - mel_input, - mel_lengths, - cond_input={ - 'speaker_ids': speaker_ids, - 'x_vectors': x_vectors - }) + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + ) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: alignment_lengths = ( - mel_lengths + - (self.decoder.r - - (mel_lengths.max() % self.decoder.r))) // self.decoder.r + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, - mel_lengths, cond_input) + cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss loss_dict = criterion( - outputs['model_outputs'], - outputs['decoder_outputs'], + outputs["model_outputs"], + outputs["decoder_outputs"], mel_input, linear_input, - outputs['stop_tokens'], + outputs["stop_tokens"], stop_targets, mel_lengths, - outputs['decoder_outputs_backward'], - outputs['alignments'], + outputs["decoder_outputs_backward"], + outputs["alignments"], alignment_lengths, - outputs['alignments_backward'], + outputs["alignments_backward"], text_lengths, ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments']) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap, batch, outputs): - postnet_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - alignments_backward = outputs['alignments_backward'] - mel_input = batch['mel_input'] + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] pred_spec = postnet_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -350,8 +350,7 @@ class Tacotron2(TacotronAbstract): } if self.bidirectional_decoder or self.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False) + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 2bea06a9..49487b67 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -37,7 +37,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[] + gradual_training=[], ): """Abstract Tacotron class""" super().__init__() diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 93d023cb..7c896dfe 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -58,25 +58,16 @@ def numpy_to_tf(np_array, dtype): def compute_style_mel(style_wav, ap, cuda=False): - style_mel = torch.FloatTensor( - ap.melspectrogram(ap.load_wav(style_wav, - sr=ap.sample_rate))).unsqueeze(0) + style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) if cuda: return style_mel.cuda() return style_mel -def 
run_model_torch(model, - inputs, - speaker_id=None, - style_mel=None, - x_vector=None): - outputs = model.inference(inputs, - cond_input={ - 'speaker_ids': speaker_id, - 'x_vector': x_vector, - 'style_mel': style_mel - }) +def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): + outputs = model.inference( + inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} + ) return outputs @@ -86,18 +77,15 @@ def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): if speaker_id is not None: raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model( - inputs, training=False) + decoder_output, postnet_output, alignments, stop_tokens = model(inputs, training=False) return decoder_output, postnet_output, alignments, stop_tokens def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: - raise NotImplementedError( - " [!] GST inference not implemented for TfLite") + raise NotImplementedError(" [!] GST inference not implemented for TfLite") if speaker_id is not None: - raise NotImplementedError( - " [!] Multi-Speaker not implemented for TfLite") + raise NotImplementedError(" [!] Multi-Speaker not implemented for TfLite") # get input and output details input_details = model.get_input_details() output_details = model.get_output_details() @@ -131,7 +119,7 @@ def parse_outputs_tflite(postnet_output, decoder_output): def trim_silence(wav, ap): - return wav[:ap.find_endpoint(wav)] + return wav[: ap.find_endpoint(wav)] def inv_spectrogram(postnet_output, ap, CONFIG): @@ -154,8 +142,7 @@ def speaker_id_to_torch(speaker_id, cuda=False): def embedding_to_torch(x_vector, cuda=False): if x_vector is not None: x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze(0).type( - torch.FloatTensor) + x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: return x_vector.cuda() return x_vector @@ -172,8 +159,7 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): """ wavs = [] for idx, spec in enumerate(inputs): - wav_len = (input_lens[idx] * - ap.hop_length) - ap.hop_length # inverse librosa padding + wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding wav = inv_spectrogram(spec, ap, CONFIG) # assert len(wav) == wav_len, f" [!] 
wav lenght: {len(wav)} vs expected: {wav_len}" wavs.append(wav[:wav_len]) @@ -241,23 +227,21 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, - text_inputs, - speaker_id, - style_mel, - x_vector=x_vector) - model_outputs = outputs['model_outputs'] + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) + model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, text_inputs, CONFIG, speaker_id, style_mel) + model, text_inputs, CONFIG, speaker_id, style_mel + ) model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens) + postnet_output, decoder_output, alignments, stop_tokens + ) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, text_inputs, CONFIG, speaker_id, style_mel) - model_outputs, decoder_output = parse_outputs_tflite( - postnet_output, decoder_output) + model, text_inputs, CONFIG, speaker_id, style_mel + ) + model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None @@ -267,9 +251,9 @@ def synthesis( if do_trim_silence: wav = trim_silence(wav, ap) return_dict = { - 'wav': wav, - 'alignments': outputs['alignments'], - 'model_outputs': model_outputs, - 'text_inputs': text_inputs + "wav": wav, + "alignments": outputs["alignments"], + "model_outputs": model_outputs, + "text_inputs": text_inputs, } return return_dict diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 3fc63e26..5e6acd1d 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -30,16 +30,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=("Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored."), + help=( + "Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored." + ), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="") + "--restore_path", type=str, help="Model file to be restored. 
Use to finetune a model.", default="" + ) parser.add_argument( "--best_path", type=str, @@ -49,23 +49,12 @@ def init_arguments(argv): ), default="", ) - parser.add_argument("--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in argv) - parser.add_argument("--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.") parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", - type=str, - default="", - help="DISTRIBUTED: process group id.") + "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv + ) + parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") + parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") return parser @@ -160,8 +149,7 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, - config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -182,8 +170,7 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", - 0) + tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a8332eb8..a31436d4 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -234,8 +234,8 @@ class Synthesizer(object): use_griffin_lim=use_gl, x_vector=speaker_embedding, ) - waveform = outputs['wav'] - mel_postnet_spec = outputs['model_outputs'] + waveform = outputs["wav"] + mel_postnet_spec = outputs["model_outputs"] if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 4bf3802f..3d802d5f 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -44,8 +44,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 6df25baa..bd119b9c 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -45,8 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index e4413438..9977864e 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -45,8 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index aef507a5..0d9a67a5 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -45,8 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_train.py 
b/tests/tts_tests/test_tacotron_train.py index 771ad93c..52560715 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -44,8 +44,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index e3004db7..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -21,7 +21,6 @@ config = MelganConfig( print_step=1, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From b4d4ce0d7e985b408d65cdd23879bc3d20fe9295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:32:08 +0200 Subject: [PATCH 039/258] remove redundant imports --- TTS/trainer.py | 5 ----- TTS/utils/{ => logging}/console_logger.py | 0 TTS/utils/{ => logging}/tensorboard_logger.py | 0 3 files changed, 5 deletions(-) rename TTS/utils/{ => logging}/console_logger.py (100%) rename TTS/utils/{ => logging}/tensorboard_logger.py (100%) diff --git a/TTS/trainer.py b/TTS/trainer.py index 06d5d6b5..7a31bb34 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -3,13 +3,8 @@ import importlib import logging import os -import sys import time -import traceback -from logging import StreamHandler -from random import randrange -import numpy as np import torch # DISTRIBUTED diff --git a/TTS/utils/console_logger.py b/TTS/utils/logging/console_logger.py similarity index 100% rename from TTS/utils/console_logger.py rename to TTS/utils/logging/console_logger.py diff --git a/TTS/utils/tensorboard_logger.py b/TTS/utils/logging/tensorboard_logger.py similarity index 100% rename from TTS/utils/tensorboard_logger.py rename to TTS/utils/logging/tensorboard_logger.py From 9bbc9243770225634b18a1e2a0372a3f95e354ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:32:35 +0200 Subject: [PATCH 040/258] import missings --- TTS/trainer.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 7a31bb34..372bb0f6 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -7,23 +7,29 @@ import time import torch +from coqpit import Coqpit +from dataclasses import dataclass, field +from typing import Tuple, Dict, List, Union + +from argparse import Namespace # DISTRIBUTED +from torch import nn from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis -from 
TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, find_module, remove_experiment_folder, set_init_dict +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict from TTS.utils.training import check_update, setup_torch_training_env From bb58a0588e033f35e547946fd01c0a4c5daa006a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:34:15 +0200 Subject: [PATCH 041/258] fix logger imports --- TTS/utils/arguments.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 5e6acd1d..90abd3b5 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -11,10 +11,9 @@ import torch from TTS.config import load_config from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.generic_utils import create_experiment_folder, get_git_branch from TTS.utils.io import copy_model_files -from TTS.utils.tensorboard_logger import TensorboardLogger def init_arguments(argv): From d376647ca077c876e8cead8ceb10727b5a8c4787 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:35:54 +0200 Subject: [PATCH 042/258] `logging/__init__.py` --- TTS/utils/logging/__init__.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 TTS/utils/logging/__init__.py diff --git a/TTS/utils/logging/__init__.py b/TTS/utils/logging/__init__.py new file mode 100644 index 00000000..877131c4 --- /dev/null +++ b/TTS/utils/logging/__init__.py @@ -0,0 +1,2 @@ +from TTS.utils.logging.console_logger import ConsoleLogger +from TTS.utils.logging.tensorboard_logger import TensorboardLogger From ca787be19300ce2004d7f6e3eef4c56212b57b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:37:08 +0200 Subject: [PATCH 043/258] make style --- TTS/bin/convert_tacotron2_torch_to_tf.py | 2 +- TTS/bin/train_tts.py | 3 +- TTS/trainer.py | 218 +++++++++++------------ TTS/tts/datasets/__init__.py | 6 +- TTS/tts/datasets/formatters.py | 1 - TTS/tts/models/align_tts.py | 6 +- TTS/tts/models/glow_tts.py | 4 +- TTS/tts/models/speedy_speech.py | 6 +- TTS/tts/models/tacotron.py | 4 +- TTS/tts/models/tacotron2.py | 4 +- TTS/tts/utils/data.py | 2 +- TTS/tts/utils/speakers.py | 2 +- TTS/utils/arguments.py | 2 +- 13 files changed, 130 insertions(+), 130 deletions(-) diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index e7f991be..119529ae 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -8,10 +8,10 @@ import numpy as np import tensorflow as tf import torch +from TTS.tts.models import setup_model from TTS.tts.tf.models.tacotron2 import Tacotron2 from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.generic_utils import save_checkpoint -from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.utils.io import 
load_config diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 607a4e3b..8182b23f 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,9 +1,10 @@ import os import sys import traceback + +from TTS.trainer import TrainerTTS from TTS.utils.arguments import init_training from TTS.utils.generic_utils import remove_experiment_folder -from TTS.trainer import TrainerTTS def main(): diff --git a/TTS/trainer.py b/TTS/trainer.py index 372bb0f6..cb905d3a 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -4,21 +4,19 @@ import importlib import logging import os import time +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Union import torch - from coqpit import Coqpit -from dataclasses import dataclass, field -from typing import Tuple, Dict, List, Union -from argparse import Namespace # DISTRIBUTED from torch import nn from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model @@ -30,49 +28,48 @@ from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.training import check_update, setup_torch_training_env @dataclass class TrainingArgs(Coqpit): continue_path: str = field( - default='', + default="", metadata={ - 'help': - 'Path to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder.' - }) + "help": "Path to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder." + }, + ) restore_path: str = field( - default='', + default="", metadata={ - 'help': - 'Path to a model checkpoit. Restore the model with the given checkpoint and start a new training.' - }) + "help": "Path to a model checkpoit. Restore the model with the given checkpoint and start a new training." + }, + ) best_path: str = field( - default='', + default="", metadata={ - 'help': - "Best model file to be used for extracting best loss. If not specified, the latest best model in continue path is used" - }) - config_path: str = field( - default='', metadata={'help': 'Path to the configuration file.'}) - rank: int = field( - default=0, metadata={'help': 'Process rank in distributed training.'}) - group_id: str = field( - default='', - metadata={'help': 'Process group id in distributed training.'}) + "help": "Best model file to be used for extracting best loss. 
If not specified, the latest best model in continue path is used" + }, + ) + config_path: str = field(default="", metadata={"help": "Path to the configuration file."}) + rank: int = field(default=0, metadata={"help": "Process rank in distributed training."}) + group_id: str = field(default="", metadata={"help": "Process group id in distributed training."}) # pylint: disable=import-outside-toplevel, too-many-public-methods class TrainerTTS: use_cuda, num_gpus = setup_torch_training_env(True, False) - def __init__(self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger, - tb_logger: TensorboardLogger, - model: nn.Module = None, - output_path: str = None) -> None: + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger, + tb_logger: TensorboardLogger, + model: nn.Module = None, + output_path: str = None, + ) -> None: self.args = args self.config = config self.c_logger = c_logger @@ -90,8 +87,7 @@ class TrainerTTS: self.keep_avg_train = None self.keep_avg_eval = None - log_file = os.path.join(self.output_path, - f"trainer_{args.rank}_log.txt") + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") self._setup_logger_config(log_file) # model, audio processor, datasets, loss @@ -106,16 +102,19 @@ class TrainerTTS: # default speaker manager self.speaker_manager = self.get_speaker_manager( - self.config, args.restore_path, self.config.output_path, self.data_train) + self.config, args.restore_path, self.config.output_path, self.data_train + ) # init TTS model if model is not None: self.model = model else: self.model = self.get_model( - len(self.model_characters), self.speaker_manager.num_speakers, - self.config, self.speaker_manager.x_vector_dim - if self.speaker_manager.x_vectors else None) + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None, + ) # setup criterion self.criterion = self.get_criterion(self.config) @@ -126,13 +125,16 @@ class TrainerTTS: # DISTRUBUTED if self.num_gpus > 1: - init_distributed(args.rank, self.num_gpus, args.group_id, - self.config.distributed["backend"], - self.config.distributed["url"]) + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed["backend"], + self.config.distributed["url"], + ) # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler( - ) if self.config.mixed_precision and self.use_cuda else None + self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None # setup optimizer self.optimizer = self.get_optimizer(self.model, self.config) @@ -154,8 +156,7 @@ class TrainerTTS: print("\n > Model has {} parameters".format(num_params)) @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, - x_vector_dim: int) -> nn.Module: + def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module: model = setup_model(num_chars, num_speakers, config, x_vector_dim) return model @@ -182,26 +183,32 @@ class TrainerTTS: return model_characters @staticmethod - def get_speaker_manager(config: Coqpit, - restore_path: str = "", - out_path: str = "", - data_train: List = []) -> SpeakerManager: + def get_speaker_manager( + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = [] + ) -> SpeakerManager: speaker_manager = SpeakerManager() if restore_path: - speakers_file = 
os.path.join(os.path.dirname(restore_path), - "speaker.json") + speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" ) speakers_file = config.external_speaker_embedding_file + if config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + speaker_manager.load_ids_file(speakers_file) + elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file) + else: + speaker_manager.parse_speakers_from_items(data_train) + file_path = os.path.join(out_path, "speakers.json") speaker_manager.save_ids_file(file_path) return speaker_manager @staticmethod - def get_scheduler(config: Coqpit, - optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: + def get_scheduler(config: Coqpit, optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: @@ -224,7 +231,7 @@ class TrainerTTS: restore_path: str, model: nn.Module, optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None + scaler: torch.cuda.amp.GradScaler = None, ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: print(" > Restoring from %s ..." % os.path.basename(restore_path)) checkpoint = torch.load(restore_path) @@ -245,13 +252,21 @@ class TrainerTTS: for group in optimizer.param_groups: group["lr"] = self.config.lr - print(" > Model restored from step %d" % checkpoint["step"], ) + print( + " > Model restored from step %d" % checkpoint["step"], + ) restore_step = checkpoint["step"] return model, optimizer, scaler, restore_step - def _get_loader(self, r: int, ap: AudioProcessor, is_eval: bool, - data_items: List, verbose: bool, - speaker_mapping: Union[Dict, List]) -> DataLoader: + def _get_loader( + self, + r: int, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + speaker_mapping: Union[Dict, List], + ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None else: @@ -295,17 +310,15 @@ class TrainerTTS: ) return loader - def get_train_dataloader(self, r: int, ap: AudioProcessor, - data_items: List, verbose: bool, - speaker_mapping: Union[List, Dict]) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, - speaker_mapping) + def get_train_dataloader( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + ) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, speaker_mapping) - def get_eval_dataloder(self, r: int, ap: AudioProcessor, data_items: List, - verbose: bool, - speaker_mapping: Union[List, Dict]) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, - speaker_mapping) + def get_eval_dataloder( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + ) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, speaker_mapping) def format_batch(self, batch: List) -> Dict: # setup input batch @@ -390,8 +403,7 @@ class TrainerTTS: "item_idx": item_idx, } - def train_step(self, batch: Dict, batch_n_steps: int, step: int, - loader_start_time: float) -> Tuple[Dict, Dict]: + def train_step(self, batch: Dict, 
batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -560,7 +572,9 @@ class TrainerTTS: self.tb_logger.tb_eval_figures(self.total_steps_done, figures) self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) - def test_run(self, ) -> None: + def test_run( + self, + ) -> None: print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} @@ -581,28 +595,26 @@ class TrainerTTS: do_trim_silence=False, ).values() - file_path = os.path.join(self.output_audio_path, - str(self.total_steps_done)) + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, - "TestSentence_{}.wav".format(idx)) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, - self.config.audio["sample_rate"]) + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup x_vector - x_vector = (self.speaker_manager.get_x_vectors_by_speaker( - self.speaker_manager.speaker_ids[0]) - if self.config.use_external_speaker_embedding_file - and self.config.use_speaker_embedding else None) + x_vector = ( + self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding + else None + ) # setup style_mel if self.config.has("gst_style_input"): style_wav = self.config.gst_style_input @@ -611,40 +623,29 @@ class TrainerTTS: if style_wav is None and "use_gst" in self.config and self.config.use_gst: # inicialize GST with zero dict. style_wav = {} - print( - "WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!" 
- ) + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = { - "speaker_id": speaker_id, - "style_wav": style_wav, - "x_vector": x_vector - } + cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector} return cond_inputs def fit(self) -> None: if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " - f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, - map_location="cpu")["model_loss"] + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] print(f" > Starting with loaded last best loss {self.best_loss}.") # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids) - self.eval_loader = (self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids) - if self.config.run_eval else None) + self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + ) + self.eval_loader = ( + self.get_eval_dataloder( + self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + ) + if self.config.run_eval + else None + ) self.total_steps_done = self.restore_step @@ -667,8 +668,7 @@ class TrainerTTS: def save_best_model(self) -> None: self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] - if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], self.best_loss, self.model, self.optimizer, @@ -685,10 +685,8 @@ class TrainerTTS: @staticmethod def _setup_logger_config(log_file: str) -> None: logging.basicConfig( - level=logging.INFO, - format="", - handlers=[logging.FileHandler(log_file), - logging.StreamHandler()]) + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) def on_epoch_start(self) -> None: # pylint: disable=no-self-use if hasattr(self.model, "on_epoch_start"): diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 69ab871d..bcdbf6a6 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -1,9 +1,11 @@ import sys -import numpy as np from collections import Counter from pathlib import Path -from TTS.tts.datasets.TTSDataset import TTSDataset + +import numpy as np + from TTS.tts.datasets.formatters import * +from TTS.tts.datasets.TTSDataset import TTSDataset #################### # UTILITIES diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index f43733b1..815a1b1d 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -7,7 +7,6 @@ from typing import List from tqdm import tqdm - ######################## # DATASETS ######################## diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index f94d9ca6..e8f80251 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -4,13 +4,13 @@ import torch.nn as nn from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import 
DurationPredictor -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor class AlignTTS(nn.Module): diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index e1c07212..8cf19f79 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -6,11 +6,11 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder +from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path +from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.data import sequence_mask class GlowTTS(nn.Module): diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 69070ffa..f00af9ad 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -3,13 +3,13 @@ from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor class SpeedySpeech(nn.Module): diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 19af28ff..6059a0d2 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -2,11 +2,11 @@ import torch from torch import nn -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram class Tacotron(TacotronAbstract): diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 4e111fda..b39a9d6f 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -3,11 +3,11 @@ import numpy as np import torch from torch import nn -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, 
Encoder, Postnet from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram class Tacotron2(TacotronAbstract): diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 5f8624e6..3ff52195 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,5 +1,5 @@ -import torch import numpy as np +import torch def _pad_data(x, length): diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 374139ee..4bfe8299 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Union, List, Any +from typing import Any, List, Union import numpy as np import torch diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 90abd3b5..9d92ae82 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -11,9 +11,9 @@ import torch from TTS.config import load_config from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.generic_utils import create_experiment_folder, get_git_branch from TTS.utils.io import copy_model_files +from TTS.utils.logging import ConsoleLogger, TensorboardLogger def init_arguments(argv): From 79f7c5da1e91fc9f8cc9d26e40f754c4bb04737c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 14:03:02 +0200 Subject: [PATCH 044/258] delete separate tts training scripts and pre-commit configuration --- .pre-commit-config.yaml | 17 +- TTS/bin/train_align_tts.py | 572 ------------------------- TTS/bin/train_glow_tts.py | 598 -------------------------- TTS/bin/train_speedy_speech.py | 578 ------------------------- TTS/bin/train_tacotron.py | 749 --------------------------------- 5 files changed, 16 insertions(+), 2498 deletions(-) delete mode 100644 TTS/bin/train_align_tts.py delete mode 100644 TTS/bin/train_glow_tts.py delete mode 100644 TTS/bin/train_speedy_speech.py delete mode 100755 TTS/bin/train_tacotron.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1ae28644..a70572dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,4 +9,19 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3 \ No newline at end of file + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort (python) + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] + - repo: https://github.com/pycqa/pylint + rev: v2.8.2 + hooks: + - id: pylint diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py deleted file mode 100644 index 34eba7a8..00000000 --- a/TTS/bin/train_align_tts.py +++ /dev/null @@ -1,572 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import AlignTTSLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import 
parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) -# torch.autograd.set_detect_anomaly(True) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, speaker_c, avg_text_length, avg_spec_length, item_idx - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, training_phase): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / 
config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # 
Diagnostic visualizations - if decoder_output is not None: - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch, training_phase): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = 
ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! 
Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = AlignTTSLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - - def set_phase(): - """Set AlignTTS training phase""" - if isinstance(config.phase_start_steps, list): - vals = [i < global_step for i in config.phase_start_steps] - if not True in vals: - phase = 0 - else: - phase = ( - 
len(config.phase_start_steps) - - [i < global_step for i in config.phase_start_steps][::-1].index(True) - - 1 - ) - else: - phase = None - return phase - - for epoch in range(0, config.epochs): - cur_phase = set_phase() - print(f"\n > Current AlignTTS phase: {cur_phase}") - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, cur_phase - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py deleted file mode 100644 index a138abeb..00000000 --- a/TTS/bin/train_glow_tts.py +++ /dev/null @@ -1,598 +0,0 @@ -#!/usr/bin/env python3 -"""Train Glow TTS model.""" - -import os -import sys -import time -import traceback -from random import randrange - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - 
speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - if attn_mask is not None: - attn_mask = attn_mask.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - item_idx, - ) - - -def data_depended_init(data_loader, model): - """Data depended initialization for activation normalization.""" - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - - model.train() - print(" > Data depended initialization ... 
") - num_iter = 0 - with torch.no_grad(): - for _, data in enumerate(data_loader): - - # format data - text_input, text_lengths, mel_input, mel_lengths, spekaer_embed, _, _, attn_mask, _ = format_data(data) - - # forward pass model - _ = model.forward(text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=spekaer_embed) - if num_iter == config.data_dep_init_steps: - break - num_iter += 1 - - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - return model - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": 
[avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_input, mel_lengths, speaker_c, _, _, attn_mask, _ = format_data(data) - - # forward pass model - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - 
loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - style_wav = config.style_wav_for_test - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! 
Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = GlowTTSLoss() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - model = data_depended_init(train_loader, model) - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, 
model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py deleted file mode 100644 index 4dc3f5f0..00000000 --- a/TTS/bin/train_speedy_speech.py +++ /dev/null @@ -1,578 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
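
A note on the pre-computation step above: the loader filters and buckets items by sequence length (min_seq_len, max_seq_len, batch_group_size), and, as the comment says, those lengths are only estimated accurately once the text has actually been converted to phonemes. The sketch below only illustrates why length-aware grouping cuts padding; it is not the TTSDataset implementation, and every name in it is hypothetical.

# Minimal sketch: sort items by their (pre-computed) sequence length and cut
# batches from the sorted order, so each batch pads only to its own longest item.
def length_sorted_batches(lengths, batch_size):
    order = sorted(range(len(lengths)), key=lambda i: lengths[i])
    return [order[i:i + batch_size] for i in range(0, len(order), batch_size)]

lengths = [12, 87, 15, 90, 14, 88]            # hypothetical phoneme-sequence lengths
for batch in length_sorted_batches(lengths, batch_size=2):
    pad_to = max(lengths[i] for i in batch)   # padding target inside this batch
    print(batch, "padded to", pad_to)
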
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # compute durations from attention mask - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - attn_mask = attn_mask.cuda(non_blocking=True) - durations = durations.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - durations, - item_idx, - ) - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - dur_target, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % 
config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _, dur_target, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = 
loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? 
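
The duration extraction in format_data above deserves a short illustration: each ground-truth attention mask is turned into per-character durations by assigning every spectrogram frame to its most-attended character, counting frames per character, and then trimming the surplus (introduced by the default duration of 1 for unattended characters) from the largest durations so the total matches the spectrogram length. Below is a self-contained sketch of that idea with a made-up alignment, leaving out the batching and masking details of the real code.

import torch

# Hypothetical hard alignment for one utterance: 4 characters x 5 frames,
# where character 2 is never the most-attended one.
attn = torch.tensor([[1., 1., 0., 0., 0.],
                     [0., 0., 1., 1., 0.],
                     [0., 0., 0., 0., 0.],
                     [0., 0., 0., 0., 1.]])
n_chars, n_frames = attn.shape

frame_to_char = attn.max(0)[1]                 # most-attended character per frame
chars, counts = torch.unique(frame_to_char, return_counts=True)
dur = torch.ones(n_chars, dtype=counts.dtype)  # unattended characters default to 1
dur[chars] = counts                            # -> [2, 2, 1, 1], sums to 6 frames

# The defaults overshoot the real frame count; remove the surplus from the
# largest durations so that dur.sum() == n_frames.
extra = int(dur.sum()) - n_frames
dur[torch.argsort(-dur)[:extra]] -= 1
print(dur, int(dur.sum()))                     # e.g. tensor([1, 2, 1, 1]) 5
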
-def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # set the portion of the data used for training if set in config.json - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = SpeedySpeechLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, 
optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py deleted file mode 100755 index 69ffbb6c..00000000 --- a/TTS/bin/train_tacotron.py +++ /dev/null @@ -1,749 +0,0 @@ -#!/usr/bin/env python3 -"""Trains Tacotron based TTS models.""" - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.utils.data import DataLoader - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import ( - NoamLR, - adam_weight_decay, - check_update, - gradual_training_scheduler, - set_weight_decay, - setup_torch_training_env, -) - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): - if is_val and not config.run_eval: - loader = None - else: - if dataset is None: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=config.model.lower() == "tacotron", - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - verbose=verbose, - speaker_mapping=( - speaker_mapping - if (config.use_speaker_embedding and config.use_external_speaker_embedding_file) - else None - ), - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
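
One detail that is easy to miss in the format_data of this deleted trainer (just below): the frame-level stop targets are folded by the reduction factor r, since the decoder emits r frames per step and, as the code comments, predicts a single stop token per iteration. A minimal stand-alone illustration of that view/sum/threshold sequence, with made-up shapes:

import torch

# Frame-level stop flags for one utterance of 6 frames, reduction factor r = 2.
stop_targets = torch.tensor([[0., 0., 0., 0., 0., 1.]])   # [B, T]
r = 2

grouped = stop_targets.view(stop_targets.shape[0], stop_targets.size(1) // r, -1)  # [B, T//r, r]
# A decoder step counts as a stop step if any of its r frames is flagged.
per_step = (grouped.sum(2) > 0.0).float()                  # tensor([[0., 0., 1.]])
print(per_step)

The deleted code additionally unsqueezes and then squeezes a trailing dimension around the float cast, which leaves the values above unchanged.
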
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - linear_input = data[3] if config.model.lower() in ["tacotron"] else None - mel_input = data[4] - mel_lengths = data[5] - stop_targets = data[6] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - - # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if config.model.lower() in ["tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - if speaker_ids is not None: - speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) - - -def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch, scaler, scaler_st): - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # setup lr - if config.noam_schedule: - scheduler.step() - - optimizer.zero_grad() - if optimizer_st: - optimizer_st.zero_grad() - - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - else: - decoder_output, 
postnet_output, alignments, stop_tokens = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - decoder_backward_output = None - alignments_backward = None - - # set the [alignment] lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - # optimizer step - if config.mixed_precision: - # model optimizer step in mixed precision mode - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - scaler.step(optimizer) - scaler.update() - - # stopnet optimizer step - if config.separate_stopnet: - scaler_st.scale(loss_dict["stopnet_loss"]).backward() - scaler.unscale_(optimizer_st) - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - scaler_st.step(optimizer) - scaler_st.update() - else: - grad_norm_st = 0 - else: - # main model optimizer step - loss_dict["loss"].backward() - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - optimizer.step() - - # stopnet optimizer step - if config.separate_stopnet: - loss_dict["stopnet_loss"].backward() - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - optimizer_st.step() - else: - grad_norm_st = 0 - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - loss_dict["stopnet_loss"] = ( - reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) if config.stopnet else loss_dict["stopnet_loss"] - ) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "max_spec_length": [max_spec_length, 1], # value, precision - "max_text_length": [max_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": 
current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time, - } - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - model.decoder.r, - OUT_PATH, - optimizer_st=optimizer_st, - model_loss=loss_dict["postnet_loss"], - characters=model_characters, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - # Diagnostic visualizations - const_spec = postnet_output[0].data.cpu().numpy() - gt_spec = ( - linear_input[0].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[0].data.cpu().numpy() - ) - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - if config.bidirectional_decoder or config.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False - ) - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - if config.model in ["Tacotron", "TacotronGST"]: - train_audio = ap.inv_spectrogram(const_spec.T) - else: - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - _, - _, - ) = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 - - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - decoder_backward_output = None - alignments_backward = None - - # set the alignment lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % 
model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - if config.stopnet: - loss_dict["stopnet_loss"] = reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = ( - linear_input[idx].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[idx].data.cpu().numpy() - ) - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - if config.model.lower() in ["tacotron"]: - eval_audio = ap.inv_spectrogram(const_spec.T) - else: - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - - if config.bidirectional_decoder or config.double_decoder_consistency: - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures["alignment2"] = plot_alignment(align_b_img, output_fig=False) - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch > config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - speaker_id = 0 if config.use_speaker_embedding else None - speaker_embedding = ( - speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]]["embedding"] - if config.use_external_speaker_embedding_file and config.use_speaker_embedding - else None - ) - style_wav = config.gst_style_input - if style_wav is None and config.gst is not None: - # inicialize GST with zero dict. - style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - - # setup custom characters if set in config file. 
- if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model_characters = phonemes if config.use_phonemes else symbols - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # set the portion of the data used for training - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim) - - # scalers for mixed precision training - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - scaler_st = torch.cuda.amp.GradScaler() if config.mixed_precision and config.separate_stopnet else None - - params = set_weight_decay(model, config.wd) - optimizer = RAdam(params, lr=config.lr, weight_decay=0) - if config.stopnet and config.separate_stopnet: - optimizer_st = RAdam(model.decoder.stopnet.parameters(), lr=config.lr, weight_decay=0) - else: - optimizer_st = None - - # setup criterion - criterion = TacotronLoss(config, stopnet_pos_weight=config.stopnet_pos_weight, ga_sigma=0.4) - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - # optimizer restore - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = apply_gradient_allreduce(model) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define data loaders - train_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 
model.decoder.r, is_val=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - # set gradual training - if config.gradual_training is not None: - r, config.batch_size = gradual_training_scheduler(global_step, config) - config.r = r - model.decoder.set_r(r) - if config.bidirectional_decoder: - model.decoder_backward.set_r(r) - train_loader.dataset.outputs_per_step = r - eval_loader.dataset.outputs_per_step = r - train_loader = setup_loader(ap, model.decoder.r, is_val=False, dataset=train_loader.dataset) - eval_loader = setup_loader(ap, model.decoder.r, is_val=True, dataset=eval_loader.dataset) - print("\n > Number of output frames:", model.decoder.r) - # train one epoch - train_avg_loss_dict, global_step = train( - train_loader, - model, - criterion, - optimizer, - optimizer_st, - scheduler, - ap, - global_step, - epoch, - scaler, - scaler_st, - ) - # eval one epoch - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_postnet_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_postnet_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) From 26e7c0960c144ec06437e3a1288ef19315978af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 14:04:51 +0200 Subject: [PATCH 045/258] linter fixes --- TTS/bin/train_tts.py | 28 +++++------ TTS/trainer.py | 6 ++- TTS/tts/models/align_tts.py | 4 +- TTS/tts/models/glow_tts.py | 6 ++- TTS/tts/models/speedy_speech.py | 4 +- TTS/tts/models/tacotron.py | 2 +- TTS/tts/models/tacotron2.py | 3 +- TTS/tts/models/tacotron_abstract.py | 5 +- TTS/tts/utils/speakers.py | 74 ----------------------------- TTS/tts/utils/synthesis.py | 7 +-- 10 files changed, 34 insertions(+), 105 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 8182b23f..3270d0e0 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -8,20 +8,20 @@ from TTS.utils.generic_utils import remove_experiment_folder def main(): - # try: - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) - trainer.fit() - # except KeyboardInterrupt: - # remove_experiment_folder(OUT_PATH) - # try: - # sys.exit(0) - # except SystemExit: - # os._exit(0) # pylint: disable=protected-access - # except Exception: # pylint: disable=broad-except - # remove_experiment_folder(OUT_PATH) - # traceback.print_exc() - # sys.exit(1) + try: + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=output_path) + trainer.fit() + except KeyboardInterrupt: + remove_experiment_folder(output_path) + try: + sys.exit(0) + except 
SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(output_path) + traceback.print_exc() + sys.exit(1) if __name__ == "__main__": diff --git a/TTS/trainer.py b/TTS/trainer.py index cb905d3a..34d73874 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -184,7 +184,7 @@ class TrainerTTS: @staticmethod def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = [] + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None ) -> SpeakerManager: speaker_manager = SpeakerManager() if restore_path: @@ -208,7 +208,9 @@ class TrainerTTS: return speaker_manager @staticmethod - def get_scheduler(config: Coqpit, optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: + def get_scheduler( + config: Coqpit, optimizer: torch.optim.Optimizer + ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index e8f80251..6efa64e2 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -275,7 +275,7 @@ class AlignTTS(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) @@ -314,7 +314,7 @@ class AlignTTS(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 8cf19f79..9f20f6bb 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -143,7 +143,9 @@ class GlowTTS(nn.Module): o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None}): + def forward( + self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ Shapes: x: [B, T] @@ -344,7 +346,7 @@ class GlowTTS(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index f00af9ad..96ef1740 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -183,7 +183,7 @@ class SpeedySpeech(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable # input sequence should be greated 
than the max convolution size inference_padding = 5 if x.shape[1] < 13: @@ -226,7 +226,7 @@ class SpeedySpeech(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 6059a0d2..da574c05 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -79,7 +79,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, - gradual_training=[], + gradual_training=None, ): super().__init__( num_chars, diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index b39a9d6f..14a838d7 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,5 +1,4 @@ # coding: utf-8 -import numpy as np import torch from torch import nn @@ -77,7 +76,7 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[], + gradual_training=None, ): super().__init__( num_chars, diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 49487b67..8eb7bf24 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,5 +1,4 @@ import copy -import logging from abc import ABC, abstractmethod import torch @@ -37,7 +36,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[], + gradual_training=None, ): """Abstract Tacotron class""" super().__init__() @@ -239,4 +238,4 @@ class TacotronAbstract(ABC, nn.Module): trainer.model.decoder_backward.set_r(r) trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) - logging.info(f"\n > Number of output frames: {self.decoder.r}") + print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 4bfe8299..3239e9a5 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,5 +1,4 @@ import json -import os import random from typing import Any, List, Union @@ -11,79 +10,6 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - if os.path.splitext(out_path)[1] == ".json": - json_file = out_path - else: - json_file = make_speakers_json_path(out_path) - with open(json_file) as f: - return json.load(f) - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - if out_path is not None: - speakers_json_path = make_speakers_json_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def parse_speakers(c, args, meta_data_train, OUT_PATH): - """Returns number of speakers, speaker embedding shape and speaker mapping""" - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - if c.use_external_speaker_embedding_file: # if restore checkpoint and use External 
Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - if not speaker_mapping: - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - if not speaker_mapping: - raise RuntimeError( - "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" - ) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - not c.use_external_speaker_embedding_file - ): # if restore checkpoint and don't use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - speaker_embedding_dim = None - assert all(speaker in speaker_mapping for speaker in speakers), ( - "As of now you, you cannot " "introduce new speakers to " "a previously trained model." - ) - elif ( - c.use_external_speaker_embedding_file and c.external_speaker_embedding_file - ): # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file - ): # if start new train using External Embedding file and don't pass external embedding file - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - else: # if start new train and don't use External Embedding file - speaker_mapping = {name: i for i, name in enumerate(speakers)} - speaker_embedding_dim = None - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers))) - else: - num_speakers = 0 - speaker_embedding_dim = None - speaker_mapping = None - - return num_speakers, speaker_embedding_dim, speaker_mapping - - class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. 
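The `SpeakerManager` shown above becomes the single entry point for speaker bookkeeping once the module-level helpers are dropped here; the following patch restores the JSON helpers and adds a `get_speaker_manager()` factory built on it. A minimal usage sketch in Python, using the method names as they read after that refactor, with an illustrative `speakers.json` path and speaker label:

    from TTS.tts.utils.speakers import SpeakerManager

    manager = SpeakerManager()
    manager.set_x_vectors_from_file("speakers.json")    # precomputed x-vectors, one entry per clip
    print(manager.num_speakers, manager.x_vector_dim)   # both derived from the loaded file
    # average five randomly picked x-vectors of one speaker
    mean_vec = manager.get_mean_x_vector("p225", num_samples=5, randomize=True)

For models trained with plain speaker IDs instead of x-vectors, `set_speaker_ids_from_data()` or `set_speaker_ids_from_file()` populates `speaker_ids` instead.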
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 7c896dfe..35b7d818 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -230,15 +230,16 @@ def synthesis( outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() + alignments = outputs["alignments"] elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( model, text_inputs, CONFIG, speaker_id, style_mel ) - model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( + model_outputs, decoder_output, alignments, stop_tokens = parse_outputs_tf( postnet_output, decoder_output, alignments, stop_tokens ) elif backend == "tflite": - decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( + decoder_output, postnet_output, alignments, stop_tokens = run_model_tflite( model, text_inputs, CONFIG, speaker_id, style_mel ) model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) @@ -252,7 +253,7 @@ def synthesis( wav = trim_silence(wav, ap) return_dict = { "wav": wav, - "alignments": outputs["alignments"], + "alignments": alignments, "model_outputs": model_outputs, "text_inputs": text_inputs, } From d4b1acfa81dc3e63c0a3bc90aede8763e881fef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 15:46:28 +0200 Subject: [PATCH 046/258] refactor `SpeakerManager` --- TTS/tts/utils/speakers.py | 199 +++++++++++++++++++++++++++++++++----- 1 file changed, 177 insertions(+), 22 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 3239e9a5..5c10c589 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,6 +1,7 @@ import json +import os import random -from typing import Any, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch @@ -10,6 +11,71 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor +def make_speakers_json_path(out_path): + """Returns conventional speakers.json location.""" + return os.path.join(out_path, "speakers.json") + + +def load_speaker_mapping(out_path): + """Loads speaker mapping if already present.""" + if os.path.splitext(out_path)[1] == ".json": + json_file = out_path + else: + json_file = make_speakers_json_path(out_path) + with open(json_file) as f: + return json.load(f) + + +def save_speaker_mapping(out_path, speaker_mapping): + """Saves speaker mapping if not yet present.""" + if out_path is not None: + speakers_json_path = make_speakers_json_path(out_path) + with open(speakers_json_path, "w") as f: + json.dump(speaker_mapping, f, indent=4) + + +def get_speaker_manager(c, args, meta_data_train): + """Inititalize and return a `SpeakerManager` based on config values""" + speaker_manager = SpeakerManager() + if c.use_speaker_embedding: + speaker_manager.set_speaker_ids_from_data(meta_data_train) + if args.restore_path: + # restoring speaker manager from a previous run. 
+ if c.use_external_speaker_embedding_file: + # restore speaker manager with the embedding file + speakers_file = os.path.dirname(args.restore_path) + if not os.path.exists(speakers_file): + print( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + if not os.path.exists(c.external_speaker_embedding_file): + raise RuntimeError( + "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" + ) + speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) + speaker_manager.set_x_vectors_from_file(speakers_file) + elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. + speakers_file = os.path.dirname(args.restore_path) + speaker_ids_from_data = speaker_manager.speaker_ids + speaker_manager.set_speaker_ids_from_file(speakers_file) + assert all( + speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + ), " [!] You cannot introduce new speakers to a pre-trained model." + elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: + # new speaker manager with external speaker embeddings. + speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) + elif ( + c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file + ): # new speaker manager with speaker IDs file. + raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + print( + " > Training with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) + ) + return speaker_manager + + class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. @@ -64,24 +130,24 @@ class SpeakerManager: self.speaker_encoder_ap = None if data_items: - self.speaker_ids = self.parse_speakers() + self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) if x_vectors_file_path: - self.load_x_vectors_file(x_vectors_file_path) + self.set_x_vectors_from_file(x_vectors_file_path) if speaker_id_file_path: - self.load_ids_file(speaker_id_file_path) + self.set_speaker_ids_from_file(speaker_id_file_path) if encoder_model_path and encoder_config_path: self.init_speaker_encoder(encoder_model_path, encoder_config_path) @staticmethod - def _load_json(json_file_path: str): + def _load_json(json_file_path: str) -> Dict: with open(json_file_path) as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict): + def _save_json(json_file_path: str, data: dict) -> None: with open(json_file_path, "w") as f: json.dump(data, f, indent=4) @@ -91,35 +157,101 @@ class SpeakerManager: @property def x_vector_dim(self): - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + """Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" + if self.x_vectors: + return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + return 0 - def parse_speakers_from_items(self, items: list): + @staticmethod + def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: + """Parse speaker IDs from data samples retured by `load_meta_data()`. 
+ + Args: + items (list): Data sampled returned by `load_meta_data()`. + + Returns: + Tuple[Dict, int]: speaker IDs and number of speakers. + """ speakers = sorted({item[2] for item in items}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} - num_speakers = len(self.speaker_ids) - return self.speaker_ids, num_speakers + speaker_ids = {name: i for i, name in enumerate(speakers)} + num_speakers = len(speaker_ids) + return speaker_ids, num_speakers - def save_ids_file(self, file_path: str): - self._save_json(file_path, self.speaker_ids) + def set_speaker_ids_from_data(self, items: List) -> None: + """Set speaker IDs from data samples. - def load_ids_file(self, file_path: str): + Args: + items (List): Data sampled returned by `load_meta_data()`. + """ + self.speaker_ids, _ = self.parse_speakers_from_data(items) + + def set_speaker_ids_from_file(self, file_path: str) -> None: + """Set speaker IDs from a file. + + Args: + file_path (str): Path to the file. + """ self.speaker_ids = self._load_json(file_path) - def save_x_vectors_file(self, file_path: str): + def save_speaker_ids_to_file(self, file_path: str) -> None: + """Save speaker IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.speaker_ids) + + def save_x_vectors_to_file(self, file_path: str) -> None: + """Save x_vectors to a json file. + + Args: + file_path (str): Path to the output file. + """ self._save_json(file_path, self.x_vectors) - def load_x_vectors_file(self, file_path: str): + def set_x_vectors_from_file(self, file_path: str) -> None: + """Load x_vectors from a json file. + + Args: + file_path (str): Path to the target json file. + """ self.x_vectors = self._load_json(file_path) self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) - def get_x_vector_by_clip(self, clip_idx: str): + def get_x_vector_by_clip(self, clip_idx: str) -> List: + """Get x_vector by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: x_vector as a list. + """ return self.x_vectors[clip_idx]["embedding"] - def get_x_vectors_by_speaker(self, speaker_idx: str): + def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all x_vectors of a speaker. + + Args: + speaker_idx (str): Target speaker ID. + + Returns: + List[List]: all the x_vectors of the given speaker. + """ return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False): + def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.Array: + """Get mean x_vector of a speaker ID. + + Args: + speaker_idx (str): Target speaker ID. + num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. + + Returns: + np.Array: Mean x_vector. 
+ """ x_vectors = self.get_x_vectors_by_speaker(speaker_idx) if num_samples is None: x_vectors = np.stack(x_vectors).mean(0) @@ -131,13 +263,19 @@ class SpeakerManager: x_vectors = np.stack(x_vectors[:num_samples]).mean(0) return x_vectors - def get_speakers(self): + def get_speakers(self) -> List: return self.speaker_ids - def get_clips(self): + def get_clips(self) -> List: return sorted(self.x_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, True) @@ -147,6 +285,15 @@ class SpeakerManager: self.speaker_encoder_ap.do_trim_silence = True def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a x_vector from a given audio file. + + Args: + wav_file (Union[str, list]): Target file path. + + Returns: + list: Computed x_vector. + """ + def _compute(wav_file: str): waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) spec = self.speaker_encoder_ap.melspectrogram(waveform) @@ -168,7 +315,15 @@ class SpeakerManager: x_vector = _compute(wav_file) return x_vector[0].tolist() - def compute_x_vector(self, feats): + def compute_x_vector(self, feats: Union[torch.Tensor, np.Array]) -> List: + """Compute x_vector from features. + + Args: + feats (Union[torch.Tensor, np.Array]): Input features. + + Returns: + List: computed x_vector. + """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) if feats.ndim == 2: From 223502d82743a8c5f9843c8023657fd370ab7f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 19:30:50 +0200 Subject: [PATCH 047/258] fix glow-tts inference and forward functions for handling `cond_input` and refactor its test --- TTS/tts/models/glow_tts.py | 28 ++++++++++++++++++++-------- tests/tts_tests/test_glow_tts.py | 25 ++++++++++++++++--------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 9f20f6bb..2c944008 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -154,10 +154,10 @@ class GlowTTS(nn.Module): y_lengths: B g: [B, C] or B """ - y_max_length = y.size(2) y = y.transpose(1, 2) + y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -196,19 +196,23 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def inference_with_MAS( + self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
It was proposed in: https://arxiv.org/abs/2104.05557 Shapes: x: [B, T] x_lenghts: B - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ + y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.external_speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -253,14 +257,18 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def decoder_inference(self, y, y_lengths=None, g=None): + def decoder_inference( + self, y, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ Shapes: - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ + y = y.transpose(1, 2) y_max_length = y.size(2) + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None # norm speaker embeddings if g is not None: if self.external_speaker_embedding_dim: @@ -276,10 +284,14 @@ class GlowTTS(nn.Module): # reverse decoder and predict y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - return y, logdet + outputs = {} + outputs["model_outputs"] = y + outputs["logdet"] = logdet + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): + def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 486de274..8a2a8fb3 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -34,7 +34,7 @@ class GlowTTSTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) @@ -114,10 +114,17 @@ class GlowTTSTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=0.001) for _ in range(5): optimizer.zero_grad() - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, None + outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, None) + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + input_lengths, ) - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, input_lengths) loss = loss_dict["loss"] loss.backward() optimizer.step() @@ -137,7 +144,7 @@ class GlowTTSInferenceTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) @@ -175,12 +182,12 @@ class GlowTTSInferenceTest(unittest.TestCase): print(" > Num parameters for 
GlowTTS model:%s" % (count_parameters(model))) # inference encoder and decoder with MAS - y, *_ = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths, None) + y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) - y_dec, _ = model.decoder_inference(mel_spec, mel_lengths) + y2 = model.decoder_inference(mel_spec, mel_lengths) assert ( - y_dec.shape == y.shape + y2["model_outputs"].shape == y["model_outputs"].shape ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y.shape, y_dec.shape + y["model_outputs"].shape, y2["model_outputs"].shape ) From 254707c610f1c1475d639e150acdc774f1bba149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 10:07:12 +0200 Subject: [PATCH 048/258] update imports for `formatters` --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/find_unique_chars.py | 6 +++--- notebooks/dataset_analysis/AnalyzeDataset.ipynb | 2 +- notebooks/dataset_analysis/PhonemeCoverage.ipynb | 2 +- tests/data_tests/test_dataset_formatters.py | 2 +- tests/data_tests/test_loader.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 3cbf40ba..eb708040 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -80,7 +80,7 @@ Example run: model.eval() # data loader - preprocessor = importlib.import_module("TTS.tts.datasets.preprocess") + preprocessor = importlib.import_module("TTS.tts.datasets.formatters") preprocessor = getattr(preprocessor, args.dataset) meta_data = preprocessor(args.data_path, args.dataset_metafile) dataset = TTSDataset( diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 7891d65a..75169569 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -3,14 +3,14 @@ import argparse import os from argparse import RawTextHelpFormatter -from TTS.tts.datasets.preprocess import get_preprocessor_by_name +from TTS.tts.datasets.formatters import get_preprocessor_by_name def main(): # pylint: disable=bad-option-value parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" - """Target dataset must be defined in TTS.tts.datasets.preprocess\n\n""" + """Target dataset must be defined in TTS.tts.datasets.formatters\n\n""" """ Example runs: @@ -20,7 +20,7 @@ def main(): ) parser.add_argument( - "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.preprocess." + "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.formatters." 
) parser.add_argument("--meta_file", type=str, default=None, help="Path to the transcriptions file of the dataset.") diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 8aa3a025..6ff2d2ca 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -31,7 +31,7 @@ "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", - "from TTS.tts.datasets.preprocess import *\n", + "from TTS.tts.datasets.formatters import *\n", "%matplotlib inline" ] }, diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb index f9540d06..e659511a 100644 --- a/notebooks/dataset_analysis/PhonemeCoverage.ipynb +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -50,7 +50,7 @@ "source": [ "# import stuff\n", "from TTS.utils.io import load_config\n", - "from TTS.tts.datasets.preprocess import load_meta_data\n", + "from TTS.tts.datasets.formatters import load_meta_data\n", "from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n", "from tqdm import tqdm\n", "from matplotlib import pylab as plt\n", diff --git a/tests/data_tests/test_dataset_formatters.py b/tests/data_tests/test_dataset_formatters.py index 968e2a29..bd83002c 100644 --- a/tests/data_tests/test_dataset_formatters.py +++ b/tests/data_tests/test_dataset_formatters.py @@ -2,7 +2,7 @@ import os import unittest from tests import get_tests_input_path -from TTS.tts.datasets.preprocess import common_voice +from TTS.tts.datasets.formatters import common_voice class TestPreprocessors(unittest.TestCase): diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 053da516..7f55b378 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader from tests import get_tests_output_path from TTS.tts.configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset -from TTS.tts.datasets.preprocess import ljspeech +from TTS.tts.datasets.formatters import ljspeech from TTS.utils.audio import AudioProcessor # pylint: disable=unused-variable From f568833d2882b17668239cb551a126c4c51519b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:41:17 +0200 Subject: [PATCH 049/258] formating `cond_input` with a function in Tacotron models --- TTS/tts/models/tacotron.py | 2 ++ TTS/tts/models/tacotron2.py | 2 ++ TTS/tts/models/tacotron_abstract.py | 6 ++++++ TTS/utils/generic_utils.py | 17 +++++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index da574c05..8d3124c3 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -191,6 +191,7 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim @@ -250,6 +251,7 @@ class Tacotron(TacotronAbstract): @torch.no_grad() def inference(self, text_input, cond_input=None): + cond_input = self._format_cond_input(cond_input) inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py 
index 14a838d7..bd1ad03e 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -191,6 +191,7 @@ class Tacotron2(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) @@ -248,6 +249,7 @@ class Tacotron2(TacotronAbstract): @torch.no_grad() def inference(self, text, cond_input=None): + cond_input = self._format_cond_input(cond_input) embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 8eb7bf24..5e561066 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,10 +1,12 @@ import copy from abc import ABC, abstractmethod +from typing import Dict import torch from torch import nn from TTS.tts.utils.data import sequence_mask +from TTS.utils.generic_utils import format_cond_input from TTS.utils.training import gradual_training_scheduler @@ -94,6 +96,10 @@ class TacotronAbstract(ABC, nn.Module): self.decoder_backward = None self.coarse_decoder = None + @staticmethod + def _format_cond_input(cond_input: Dict) -> Dict: + return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input) + ############################# # INIT FUNCTIONS ############################# diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index a562e86f..0c28116d 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -8,6 +8,7 @@ import shutil import subprocess import sys from pathlib import Path +from typing import Dict import torch @@ -126,6 +127,22 @@ def set_init_dict(model_dict, checkpoint_state, c): return model_dict +def format_cond_input(def_args: Dict, kwargs: Dict) -> Dict: + """Format kwargs to hande auxilary inputs to models. + + Args: + def_args (Dict): A dictionary of argument names and their default values if not defined in `kwargs`. + kwargs (Dict): A `dict` or `kwargs` that includes auxilary inputs to the model. + + Returns: + Dict: arguments with formatted auxilary inputs. 
+ """ + for name in def_args: + if name not in kwargs: + kwargs[def_args[name]] = None + return kwargs + + class KeepAverage: def __init__(self): self.avg_values = {} From 506189bdee3b1837d2716a4d89b7b5184ceb4a7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:42:07 +0200 Subject: [PATCH 050/258] update glow-tts output shapes to match [B, T, C] --- TTS/tts/models/glow_tts.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 2c944008..af52ba1c 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -185,13 +185,13 @@ class GlowTTS(nn.Module): y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - "model_outputs": z, + "model_outputs": z.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs @@ -246,13 +246,13 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = { - "model_outputs": y, + "model_outputs": z.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs @@ -285,7 +285,7 @@ class GlowTTS(nn.Module): y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = {} - outputs["model_outputs"] = y + outputs["model_outputs"] = y.transpose(1, 2) outputs["logdet"] = logdet return outputs @@ -317,13 +317,13 @@ class GlowTTS(nn.Module): y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - "model_outputs": y, + "model_outputs": y.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs From 9960c0c3561e796d9cebc329f598c3972a10b2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:43:40 +0200 Subject: [PATCH 051/258] update test for the new input output API of the tts models --- tests/data_tests/test_loader.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 23 +++++-- tests/tts_tests/test_tacotron2_model.py | 72 ++++++++++---------- tests/tts_tests/test_tacotron_model.py | 50 +++++++------- 4 files changed, 79 insertions(+), 68 deletions(-) diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 7f55b378..cad89d09 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - dataset = TTSDataset.TTSDataset( + 
dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=True, diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 21a73812..66339a82 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -45,17 +45,25 @@ def test_speedy_speech(): model.cuda() # forward pass - o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations) + outputs = model(x_dummy, x_lengths, y_lengths, durations) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device)) + model.forward( + x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} + ) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] @@ -63,8 +71,11 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device)) + model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 4d711700..0933ec70 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -52,15 +52,15 @@ class TacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter 
changes @@ -85,7 +85,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_embeddings = torch.rand(8, 55).to(device) + speaker_ids = torch.rand(8, 55).to(device) for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 @@ -104,15 +104,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -157,15 +157,15 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -213,15 +213,15 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + 
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -270,15 +270,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index fcbac0f7..86de5d16 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -68,13 +68,13 @@ class TacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -129,13 +129,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -193,13 +193,13 @@ class 
TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -256,13 +256,13 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -318,13 +318,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes From a58e986f684ba71450be9eaf4aa8be1b12c23dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:44:09 +0200 Subject: [PATCH 052/258] reduce fullband-melgan test model size --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index 2b286b91..6e533eb9 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -22,6 +22,7 @@ config = FullbandMelganConfig( print_eval=True, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, 
output_path=output_path, ) config.audio.do_trim_silence = True From ab7f299d485f7501ffbe9649606034e70749400c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 10:16:38 +0200 Subject: [PATCH 053/258] update test to be less demanding --- tests/vocoder_tests/test_fullband_melgan_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index 6e533eb9..fbce03eb 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -22,7 +22,7 @@ config = FullbandMelganConfig( print_eval=True, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, ) config.audio.do_trim_silence = True From e229f5c081ce1cf3cec90bc50b8459827144fecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:45:59 +0200 Subject: [PATCH 054/258] fix type annotations --- TTS/tts/utils/speakers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5c10c589..cebf0dca 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -241,7 +241,7 @@ class SpeakerManager: """ return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.Array: + def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean x_vector of a speaker ID. Args: @@ -250,7 +250,7 @@ class SpeakerManager: randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. Returns: - np.Array: Mean x_vector. + np.ndarray: Mean x_vector. """ x_vectors = self.get_x_vectors_by_speaker(speaker_idx) if num_samples is None: @@ -315,11 +315,11 @@ class SpeakerManager: x_vector = _compute(wav_file) return x_vector[0].tolist() - def compute_x_vector(self, feats: Union[torch.Tensor, np.Array]) -> List: + def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: """Compute x_vector from features. Args: - feats (Union[torch.Tensor, np.Array]): Input features. + feats (Union[torch.Tensor, np.ndarray]): Input features. Returns: List: computed x_vector. From 120ea679f9acaf769818e24a945eae27ef07ca6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 10:48:13 +0200 Subject: [PATCH 055/258] add `test_all` to makefile --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 4dc2d588..70b7e34a 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,10 @@ dev-deps: ## install development deps deps: ## install 🐸 requirements. pip install -r requirements.txt +test_all: ## run tests and don't stop on an error. + nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id + ./run_bash_tests.sh + test: ## run tests. 
nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id ./run_bash_tests.sh From 534401377d4b9a8a894705d4daf28241a2102602 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:13 +0200 Subject: [PATCH 056/258] styling formatting.py --- TTS/utils/arguments.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 9d92ae82..55bad4f2 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -29,16 +29,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), + help=("Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored."), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="") parser.add_argument( "--best_path", type=str, @@ -48,12 +48,23 @@ def init_arguments(argv): ), default="", ) + parser.add_argument("--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in argv) + parser.add_argument("--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.") parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", + type=str, + default="", + help="DISTRIBUTED: process group id.") return parser @@ -148,7 +159,8 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, + config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -169,7 +181,8 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>", 0) + tb_logger.tb_add_text("model-config", f"<pre>{config.to_json()}</pre>
", + 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From 7a0750a4f55d6c0cb41cb02501bbfc32275c557c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 17:25:00 +0200 Subject: [PATCH 057/258] make style --- TTS/tts/models/align_tts.py | 2 +- TTS/tts/models/speedy_speech.py | 2 +- TTS/tts/models/tacotron.py | 4 +--- TTS/utils/arguments.py | 39 +++++++++++---------------------- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 6efa64e2..3e8d4adc 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -275,7 +275,7 @@ class AlignTTS(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 96ef1740..455dbf38 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -183,7 +183,7 @@ class SpeedySpeech(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 8d3124c3..12c3e5f9 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -191,11 +191,9 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} - input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) - # B x T_in x embed_dim inputs = self.embedding(text) + input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 55bad4f2..9d92ae82 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -29,16 +29,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=("Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored."), + help=( + "Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored." + ), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="") + "--restore_path", type=str, help="Model file to be restored. 
Use to finetune a model.", default="" + ) parser.add_argument( "--best_path", type=str, @@ -48,23 +48,12 @@ def init_arguments(argv): ), default="", ) - parser.add_argument("--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in argv) - parser.add_argument("--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.") parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", - type=str, - default="", - help="DISTRIBUTED: process group id.") + "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv + ) + parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") + parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") return parser @@ -159,8 +148,7 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, - config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -181,8 +169,7 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>", - 0) + tb_logger.tb_add_text("model-config", f"<pre>{config.to_json()}</pre>
", 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From 2e31659dd9044eaa838fbff43024ef05548eb822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 14:35:15 +0200 Subject: [PATCH 058/258] docstring fix --- TTS/tts/models/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index bd1ad03e..68867ec8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -45,7 +45,7 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ From c09622459e367702cf6f26fddb9a6a2496e5fd2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 14:53:57 +0200 Subject: [PATCH 059/258] init `durations = None` --- TTS/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/trainer.py b/TTS/trainer.py index 34d73874..d81132cf 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -351,6 +351,7 @@ class TrainerTTS: speaker_ids = None # compute durations from attention masks + durations = None if attn_mask is not None: durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) for idx, am in enumerate(attn_mask): From 26a3312f0d24ac0a791ca0b1cc6c824c581a5bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 16:26:28 +0200 Subject: [PATCH 060/258] change `to(device)` to `type_as` in models --- TTS/tts/models/tacotron_abstract.py | 12 +++++------- TTS/tts/tf/utils/generic_utils.py | 3 +-- TTS/tts/utils/ssim.py | 9 +-------- TTS/vocoder/models/wavernn.py | 13 ++++++------- TTS/vocoder/utils/distribution.py | 4 +--- 5 files changed, 14 insertions(+), 27 deletions(-) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 5e561066..fe43d81f 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -145,14 +145,13 @@ class TacotronAbstract(ABC, nn.Module): def compute_masks(self, text_lengths, mel_lengths): """Compute masks against sequence paddings.""" # B x T_in_max (boolean) - device = text_lengths.device - input_mask = sequence_mask(text_lengths).to(device) + input_mask = sequence_mask(text_lengths) output_mask = None if mel_lengths is not None: max_len = mel_lengths.max() r = self.decoder.r max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device) + output_mask = sequence_mask(mel_lengths, max_len=max_len) return input_mask, output_mask def _backward_pass(self, mel_specs, encoder_outputs, mask): @@ -195,20 +194,19 @@ class TacotronAbstract(ABC, nn.Module): def compute_gst(self, inputs, style_input, speaker_embedding=None): """Compute global style token""" - device = inputs.device if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).to(device) + query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) if speaker_embedding is not None: query = torch.cat([query, 
speaker_embedding.reshape(1, 1, -1)], dim=-1) _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) for k_token, v_amplifier in style_input.items(): key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) gst_outputs = gst_outputs + gst_outputs_att * v_amplifier elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) else: gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable inputs = self._concat_speaker_embedding(inputs, gst_outputs) diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py index 5b8b4ce2..e76893c2 100644 --- a/TTS/tts/tf/utils/generic_utils.py +++ b/TTS/tts/tf/utils/generic_utils.py @@ -44,8 +44,7 @@ def sequence_mask(sequence_length, max_len=None): batch_size = sequence_length.size(0) seq_range = np.empty([0, max_len], dtype=np.int8) seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - if sequence_length.is_cuda: - seq_range_expand = seq_range_expand.cuda() + seq_range_expand = seq_range_expand.type_as(sequence_length) seq_length_expand = sequence_length.unsqueeze(1).expand_as(seq_range_expand) # B x T_max return seq_range_expand < seq_length_expand diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 11107e47..caed575f 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -56,9 +56,6 @@ class SSIM(torch.nn.Module): window = self.window else: window = create_window(self.window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) window = window.type_as(img1) self.window = window @@ -69,10 +66,6 @@ class SSIM(torch.nn.Module): def ssim(img1, img2, window_size=11, size_average=True): (_, channel, _, _) = img1.size() - window = create_window(window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) + window = create_window(window_size, channel).type_as(img1) window = window.type_as(img1) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 994244dc..04040931 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -251,7 +251,6 @@ class WaveRNN(nn.Module): def inference(self, mels, batched=None, target=None, overlap=None): self.eval() - device = mels.device output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) @@ -259,7 +258,7 @@ class WaveRNN(nn.Module): with torch.no_grad(): if isinstance(mels, np.ndarray): - mels = torch.FloatTensor(mels).to(device) + mels = torch.FloatTensor(mels).type_as(mels) if mels.ndim == 2: mels = mels.unsqueeze(0) @@ -275,9 +274,9 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).to(device) - h2 = torch.zeros(b_size, self.rnn_dims).to(device) - x = torch.zeros(b_size, 1).to(device) + h1 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + h2 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + x = torch.zeros(b_size, 1).type_as(mels) if self.use_aux_net: d = self.aux_dims @@ -310,11 +309,11 @@ class WaveRNN(nn.Module): if self.mode == "mold": sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2)) 
output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) + x = sample.transpose(0, 1).type_as(mels) elif self.mode == "gauss": sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) + x = sample.transpose(0, 1).type_as(mels) elif isinstance(self.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 5c2742c8..43d0d884 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -149,8 +149,6 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis - one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() - if tensor.is_cuda: - one_hot = one_hot.cuda() + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_().type_as(tensor) one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) return one_hot From fdce9eb2344b5ceea9ae66b1365d3f744352e1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 10:17:58 +0200 Subject: [PATCH 061/258] reduce size of the metadata.csv used at testing --- tests/data/ljspeech/metadata.csv | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/data/ljspeech/metadata.csv b/tests/data/ljspeech/metadata.csv index 8f7832b5..6c65ca0d 100644 --- a/tests/data/ljspeech/metadata.csv +++ b/tests/data/ljspeech/metadata.csv @@ -6,27 +6,3 @@ LJ001-0005|the invention of movable metal letters in the middle of the fifteenth LJ001-0006|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography, LJ001-0007|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five, LJ001-0008|has never been surpassed.|has never been surpassed. -LJ001-0009|Printing, then, for our purpose, may be considered as the art of making books by means of movable types.|Printing, then, for our purpose, may be considered as the art of making books by means of movable types. -LJ001-0010|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress,|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress, -LJ001-0011|it is of the first importance that the letter used should be fine in form;|it is of the first importance that the letter used should be fine in form; -LJ001-0012|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters -LJ001-0013|than in the same operations with ugly ones.|than in the same operations with ugly ones. 
-LJ001-0014|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were,|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were, -LJ001-0015|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. -LJ001-0016|The Middle Ages brought calligraphy to perfection, and it was natural therefore|The Middle Ages brought calligraphy to perfection, and it was natural therefore -LJ001-0017|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely.|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely. -LJ001-0018|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character,|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character, -LJ001-0019|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters;|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters; -LJ001-0020|the "lower-case" being in fact invented in the early Middle Ages.|the "lower-case" being in fact invented in the early Middle Ages. -LJ001-0021|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation -LJ001-0022|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type,"|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type," -LJ001-0023|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century.|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century. -LJ001-0024|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year 1462)|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year fourteen sixty-two) -LJ001-0025|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read.|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read. 
-LJ001-0026|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type,|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type, -LJ001-0027|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer,|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer, -LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities. -LJ001-0029|But though on the whole, except in Italy, Gothic letter was most often used|But though on the whole, except in Italy, Gothic letter was most often used -LJ001-0030|a very few years saw the birth of Roman character not only in Italy, but in Germany and France.|a very few years saw the birth of Roman character not only in Italy, but in Germany and France. -LJ001-0031|In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,|In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome, -LJ001-0032|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman,|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman, \ No newline at end of file From e7b7268c43d142353a6d647efbe5cea6d57cad65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 15:05:39 +0200 Subject: [PATCH 062/258] use `to_cuda()` for moving data in `format_batch()` --- TTS/trainer.py | 22 +++++++++++----------- TTS/tts/datasets/TTSDataset.py | 2 +- TTS/utils/generic_utils.py | 9 +++++++++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index d81132cf..8ec59f55 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -27,7 +27,7 @@ from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.training import check_update, setup_torch_training_env @@ -377,18 +377,18 @@ class TrainerTTS: # dispatch batch to GPU if self.use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if self.config.model.lower() in ["tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - attn_mask = attn_mask.cuda(non_blocking=True) if attn_mask is not None else None - durations = durations.cuda(non_blocking=True) if attn_mask is not None else None + text_input = to_cuda(text_input) + text_lengths = to_cuda(text_lengths) + mel_input = to_cuda(mel_input) + mel_lengths = to_cuda(mel_lengths) + linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None + stop_targets = to_cuda(stop_targets) + attn_mask = to_cuda(attn_mask) if attn_mask is not None else None + durations = to_cuda(durations) if attn_mask is not None else None if speaker_ids is not None: 
- speaker_ids = speaker_ids.cuda(non_blocking=True) + speaker_ids = to_cuda(speaker_ids) if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) + speaker_embeddings = to_cuda(speaker_embeddings) return { "text_input": text_input, diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index cbb0a593..76f82c97 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -282,7 +282,7 @@ class TTSDataset(Dataset): """ # Puts each data field into a tensor with outer dimension batch size - if isinstance(batch[0], collections.Mapping): + if isinstance(batch[0], collections.abc.Mapping): text_lenghts = np.array([len(d["text"]) for d in batch]) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 0c28116d..a1abf5fe 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -13,6 +13,15 @@ from typing import Dict import torch +def to_cuda(x: torch.Tensor) -> torch.Tensor: + if x is None: + return None + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) + return x + + def get_cuda(): use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") From f00ef90ce6b0d5748b379920d3d9d62f91d09adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 11:42:40 +0200 Subject: [PATCH 063/258] rename external speaker embedding arguments as `d_vectors` --- TTS/bin/extract_tts_spectrograms.py | 23 ++-- TTS/bin/synthesize.py | 2 +- TTS/trainer.py | 26 ++-- TTS/tts/layers/tacotron/gst_layers.py | 10 +- TTS/tts/layers/tacotron/tacotron.py | 2 +- TTS/tts/models/__init__.py | 8 +- TTS/tts/models/align_tts.py | 12 +- TTS/tts/models/glow_tts.py | 42 +++---- TTS/tts/models/speedy_speech.py | 12 +- TTS/tts/models/tacotron.py | 44 +++---- TTS/tts/models/tacotron2.py | 42 +++---- TTS/tts/models/tacotron_abstract.py | 40 +++---- TTS/tts/utils/speakers.py | 118 +++++++++---------- TTS/tts/utils/synthesis.py | 24 ++-- TTS/utils/synthesizer.py | 18 +-- tests/test_extract_tts_spectrograms.py | 6 +- tests/test_speaker_manager.py | 54 ++++----- tests/tts_tests/test_speedy_speech_layers.py | 4 +- tests/tts_tests/test_tacotron2_model.py | 8 +- tests/tts_tests/test_tacotron_model.py | 8 +- 20 files changed, 251 insertions(+), 252 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 64abc719..5137d48a 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -108,9 +108,8 @@ def format_data(data): mel_lengths = mel_lengths.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - + if d_vectors is not None: + d_vectors = d_vectors.cuda(non_blocking=True) if attn_mask is not None: attn_mask = attn_mask.cuda(non_blocking=True) return ( @@ -119,7 +118,7 @@ def format_data(data): mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, avg_text_length, avg_spec_length, attn_mask, @@ -137,23 +136,23 @@ def inference( mel_input, mel_lengths, speaker_ids=None, - speaker_embeddings=None, + d_vectors=None, ): if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids - elif speaker_embeddings is not None: - speaker_c = speaker_embeddings + elif d_vectors is not None: + speaker_c = d_vectors outputs = 
model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c} + text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c} ) model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) postnet_outputs = outputs["model_outputs"] # normalize tacotron output @@ -184,7 +183,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, _, _, _, @@ -200,7 +199,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, ) for idx in range(text_input.shape[0]): @@ -256,7 +255,7 @@ def main(args): # pylint: disable=redefined-outer-name speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim) + model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index a5066e3d..3cde5612 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -157,7 +157,7 @@ def main(): parser.add_argument( "--speaker_wav", nargs="+", - help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. 
The d_vectors is computed as their average.", default=None, ) parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None) diff --git a/TTS/trainer.py b/TTS/trainer.py index 8ec59f55..55560624 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -113,7 +113,7 @@ class TrainerTTS: len(self.model_characters), self.speaker_manager.num_speakers, self.config, - self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None, + self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, ) # setup criterion @@ -156,8 +156,8 @@ class TrainerTTS: print("\n > Model has {} parameters".format(num_params)) @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, x_vector_dim) + def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, d_vector_dim) return model @staticmethod @@ -196,11 +196,11 @@ class TrainerTTS: speakers_file = config.external_speaker_embedding_file if config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) + speaker_manager.load_d_vectors_file(speakers_file) else: speaker_manager.load_ids_file(speakers_file) elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file) + speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file) else: speaker_manager.parse_speakers_from_items(data_train) file_path = os.path.join(out_path, "speakers.json") @@ -387,8 +387,8 @@ class TrainerTTS: durations = to_cuda(durations) if attn_mask is not None else None if speaker_ids is not None: speaker_ids = to_cuda(speaker_ids) - if speaker_embeddings is not None: - speaker_embeddings = to_cuda(speaker_embeddings) + if d_vectors is not None: + d_vectors = to_cuda(d_vectors) return { "text_input": text_input, @@ -400,7 +400,7 @@ class TrainerTTS: "attn_mask": attn_mask, "durations": durations, "speaker_ids": speaker_ids, - "x_vectors": speaker_embeddings, + "d_vectors": d_vectors, "max_text_length": max_text_length, "max_spec_length": max_spec_length, "item_idx": item_idx, @@ -591,7 +591,7 @@ class TrainerTTS: self.use_cuda, self.ap, speaker_id=cond_inputs["speaker_id"], - x_vector=cond_inputs["x_vector"], + d_vector=cond_inputs["d_vector"], style_wav=cond_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, @@ -612,9 +612,9 @@ class TrainerTTS: def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None - # setup x_vector - x_vector = ( - self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None ) @@ -629,7 +629,7 @@ class TrainerTTS: print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector} + cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} return cond_inputs def fit(self) -> None: diff --git 
a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index e2784e5d..02154093 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -8,10 +8,10 @@ class GST(nn.Module): See https://arxiv.org/pdf/1803.09017""" - def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None): super().__init__() self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim) - self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim) + self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim) def forward(self, inputs, speaker_embedding=None): enc_out = self.encoder(inputs) @@ -83,13 +83,13 @@ class ReferenceEncoder(nn.Module): class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" - def __init__(self, num_heads, num_style_tokens, embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None): super().__init__() self.query_dim = embedding_dim // 2 - if speaker_embedding_dim: - self.query_dim += speaker_embedding_dim + if d_vector_dim: + self.query_dim += d_vector_dim self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim)) diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index dc38173f..2f94db88 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -266,7 +266,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. - speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. + d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. 
""" # Pylint gets confused by PyTorch conventions here diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 153f8d43..026f5c85 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,7 +1,7 @@ from TTS.utils.generic_utils import find_module -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): +def setup_model(num_chars, num_speakers, c, d_vector_dim=None): print(" > Using model: {}".format(c.model)) MyModel = find_module("TTS.tts.models", c.model.lower()) if c.model.lower() in "tacotron": @@ -29,7 +29,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "tacotron2": model = MyModel( @@ -55,7 +55,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "glow_tts": model = MyModel( @@ -79,7 +79,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): num_squeeze=2, sigmoid_scale=False, mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "speedy_speech": model = MyModel( diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 3e8d4adc..20b0cdf7 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -212,7 +212,7 @@ class AlignTTS(nn.Module): return dr_mas, mu, log_sigma, logp def forward( - self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None + self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None ): # pylint: disable=unused-argument """ Shapes: @@ -223,7 +223,7 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -267,14 +267,14 @@ class AlignTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -293,10 +293,10 @@ class AlignTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) loss_dict = criterion( outputs["logp"], diff --git a/TTS/tts/models/glow_tts.py 
b/TTS/tts/models/glow_tts.py index af52ba1c..9c928a67 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -36,7 +36,7 @@ class GlowTTS(nn.Module): mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. encoder_type (str): encoder module type. encoder_params (dict): encoder module parameters. - speaker_embedding_dim (int): channels of external speaker embedding vectors. + d_vector_dim (int): channels of external speaker embedding vectors. """ def __init__( @@ -62,7 +62,7 @@ class GlowTTS(nn.Module): mean_only=False, encoder_type="transformer", encoder_params=None, - speaker_embedding_dim=None, + d_vector_dim=None, ): super().__init__() @@ -88,15 +88,15 @@ class GlowTTS(nn.Module): # model constants. self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim # if is a multispeaker and c_in_channels is 0, set to 256 if num_speakers > 1: - if self.c_in_channels == 0 and not self.speaker_embedding_dim: + if self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.speaker_embedding_dim: - self.c_in_channels = self.speaker_embedding_dim + elif self.d_vector_dim: + self.c_in_channels = self.d_vector_dim self.encoder = Encoder( num_chars, @@ -125,7 +125,7 @@ class GlowTTS(nn.Module): c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not speaker_embedding_dim: + if num_speakers > 1 and not d_vector_dim: # speaker embedding layer self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @@ -144,7 +144,7 @@ class GlowTTS(nn.Module): return y_mean, y_log_scale, o_attn_dur def forward( - self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -157,9 +157,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -197,7 +197,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def inference_with_MAS( - self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
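The external d-vectors consumed by this conditioning path are the per-clip speaker embeddings handled by the SpeakerManager (renamed further below in speakers.py). A minimal sketch of assembling a batch-sized conditioning tensor for a model built with `d_vector_dim` set; the file path and batch size are illustrative assumptions, not values fixed by the patch:

    import torch
    from TTS.tts.utils.speakers import SpeakerManager

    # "speakers.json" is a placeholder for a pre-computed d-vector file
    # mapping clip names to {"name": <speaker>, "embedding": [...]}
    manager = SpeakerManager(d_vectors_file_path="speakers.json")
    clip_names = manager.get_clips()[:8]  # an illustrative batch of 8 clips
    d_vectors = torch.FloatTensor([manager.get_d_vector_by_clip(c) for c in clip_names])  # [B, D]
    cond_input = {"d_vectors": d_vectors}  # the key expected by forward()/inference()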
@@ -212,9 +212,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -258,7 +258,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def decoder_inference( - self, y, y_lengths=None, cond_input={"x_vectors": None} + self, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -268,10 +268,10 @@ class GlowTTS(nn.Module): """ y = y.transpose(1, 2) y_max_length = y.size(2) - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None # norm speaker embeddings if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -290,10 +290,10 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] @@ -338,9 +338,9 @@ class GlowTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors}) loss_dict = criterion( outputs["model_outputs"], diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 455dbf38..53f7bbaa 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward( - self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None} + self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids @@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, x, 
cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 @@ -204,11 +204,11 @@ class SpeedySpeech(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] durations = batch["durations"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 12c3e5f9..123b69a7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -42,7 +42,7 @@ class Tacotron(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. memory_size (int, optional): size of the history queue fed to the prenet. 
Model feeds the last ```memory_size``` @@ -75,7 +75,7 @@ class Tacotron(TacotronAbstract): ddc_r=None, encoder_in_features=256, decoder_in_features=256, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, memory_size=5, @@ -104,7 +104,7 @@ class Tacotron(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -112,14 +112,14 @@ class Tacotron(TacotronAbstract): # speaker embedding layers if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 256 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) @@ -154,7 +154,7 @@ class Tacotron(TacotronAbstract): if self.gst and self.use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ outputs = {"alignments_backward": None, "decoder_outputs_backward": None} inputs = self.embedding(text) @@ -201,16 +201,16 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) # speaker embedding if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -254,15 +254,15 @@ class Tacotron(TacotronAbstract): encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x 
speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) @@ -289,7 +289,7 @@ class Tacotron(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -297,7 +297,7 @@ class Tacotron(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -308,7 +308,7 @@ class Tacotron(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 68867ec8..4628c64e 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -42,7 +42,7 @@ class Tacotron2(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. 
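With this rename, external speaker conditioning enters Tacotron2 through `d_vector_dim` at construction time and through a `cond_input` dict keyed by "d_vectors" at call time. A minimal sketch mirroring the unit tests further below; the sizes, reduction factor, and dummy tensors are illustrative assumptions, and the remaining constructor arguments are left at their defaults:

    import torch
    from TTS.tts.models.tacotron2 import Tacotron2

    # hypothetical sizes: 5 speakers, 55-dim external d-vectors, batch of 8
    model = Tacotron2(num_chars=24, r=2, num_speakers=5, d_vector_dim=55)
    text = torch.randint(0, 24, (8, 30))                   # [B, T_in]
    text_lengths = torch.full((8,), 30, dtype=torch.long)
    mels = torch.rand(8, 30, 80)                           # [B, T_out, num_mels]
    mel_lengths = torch.full((8,), 30, dtype=torch.long)
    d_vectors = torch.rand(8, 55)                          # one pre-computed d-vector per sample
    outputs = model.forward(
        text, text_lengths, mels, mel_lengths, cond_input={"d_vectors": d_vectors}
    )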
@@ -73,7 +73,7 @@ class Tacotron2(TacotronAbstract): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -101,7 +101,7 @@ class Tacotron2(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -109,14 +109,14 @@ class Tacotron2(TacotronAbstract): # speaker embedding layer if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 512 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) @@ -142,13 +142,13 @@ class Tacotron2(TacotronAbstract): self.postnet = Postnet(self.postnet_output_dim) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_g = prenet_dropout_at_inference # global style token layers if self.gst and use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron2(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} @@ -202,15 +202,15 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -255,15 +255,15 @@ class Tacotron2(TacotronAbstract): if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] 
x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input["x_vectors"] + embedded_speakers = cond_input["d_vectors"] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -291,7 +291,7 @@ class Tacotron2(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -299,7 +299,7 @@ class Tacotron2(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -310,7 +310,7 @@ class Tacotron2(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index fe43d81f..e480e2e0 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -35,7 +35,7 @@ class TacotronAbstract(ABC, nn.Module): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -66,7 +66,7 @@ class TacotronAbstract(ABC, nn.Module): self.separate_stopnet = separate_stopnet self.encoder_in_features = encoder_in_features self.decoder_in_features = decoder_in_features - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim self.gradual_training = gradual_training # layers @@ -76,12 +76,12 @@ class TacotronAbstract(ABC, nn.Module): self.postnet = None # multispeaker - if self.speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False + if self.d_vector_dim is None: + # if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim + self.use_d_vectors = False else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True + # if d_vector_dim is not None we need use speaker embedding per sample + self.use_d_vectors = True # global style token if self.gst and use_gst: @@ -89,8 +89,8 @@ class TacotronAbstract(ABC, nn.Module): self.gst_layer = None # model states - self.speaker_embeddings = None - self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None # additional layers self.decoder_backward = None @@ -98,15 +98,15 @@ class TacotronAbstract(ABC, nn.Module): @staticmethod def _format_cond_input(cond_input: Dict) -> Dict: - return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input) + return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input) ############################# # INIT FUNCTIONS ############################# def _init_states(self): - self.speaker_embeddings = None - 
self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None def _init_backward_decoder(self): self.decoder_backward = copy.deepcopy(self.decoder) @@ -188,9 +188,9 @@ class TacotronAbstract(ABC, nn.Module): if hasattr(self, "speaker_embedding") and speaker_ids is None: raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided") if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) def compute_gst(self, inputs, style_input, speaker_embedding=None): """Compute global style token""" @@ -213,15 +213,15 @@ class TacotronAbstract(ABC, nn.Module): return inputs @staticmethod - def _add_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + speaker_embeddings_ + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ return outputs @staticmethod - def _concat_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) return outputs ############################# diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index cebf0dca..546d483d 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -52,8 +52,8 @@ def get_speaker_manager(c, args, meta_data_train): raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" ) - speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) - speaker_manager.set_x_vectors_from_file(speakers_file) + speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. speakers_file = os.path.dirname(args.restore_path) speaker_ids_from_data = speaker_manager.speaker_ids @@ -63,7 +63,7 @@ def get_speaker_manager(c, args, meta_data_train): ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file) elif ( c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file ): # new speaker manager with speaker IDs file. @@ -88,7 +88,7 @@ class SpeakerManager: { 'clip_name.wav':{ 'name': 'speakerA', - 'embedding'[] + 'embedding'[] }, ... 
} @@ -103,10 +103,10 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> x_vector = manager.compute_x_vector(mel.T) + >>> d_vector = manager.compute_d_vector(mel.T) Args: - x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". + d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". @@ -116,15 +116,15 @@ class SpeakerManager: def __init__( self, data_items: List[List[Any]] = None, - x_vectors_file_path: str = "", + d_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", ): self.data_items = [] - self.x_vectors = {} - self.speaker_ids = [] + self.d_vectors = {} + self.speaker_ids = {} self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None @@ -132,8 +132,8 @@ class SpeakerManager: if data_items: self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) - if x_vectors_file_path: - self.set_x_vectors_from_file(x_vectors_file_path) + if d_vectors_file_path: + self.set_d_vectors_from_file(d_vectors_file_path) if speaker_id_file_path: self.set_speaker_ids_from_file(speaker_id_file_path) @@ -156,10 +156,10 @@ class SpeakerManager: return len(self.speaker_ids) @property - def x_vector_dim(self): - """Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" - if self.x_vectors: - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + def d_vector_dim(self): + """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" + if self.d_vectors: + return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) return 0 @staticmethod @@ -201,73 +201,73 @@ class SpeakerManager: """ self._save_json(file_path, self.speaker_ids) - def save_x_vectors_to_file(self, file_path: str) -> None: - """Save x_vectors to a json file. + def save_d_vectors_to_file(self, file_path: str) -> None: + """Save d_vectors to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.x_vectors) + self._save_json(file_path, self.d_vectors) - def set_x_vectors_from_file(self, file_path: str) -> None: - """Load x_vectors from a json file. + def set_d_vectors_from_file(self, file_path: str) -> None: + """Load d_vectors from a json file. Args: file_path (str): Path to the target json file. """ - self.x_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) - self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) + self.d_vectors = self._load_json(file_path) + self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values()))) + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - def get_x_vector_by_clip(self, clip_idx: str) -> List: - """Get x_vector by clip ID. + def get_d_vector_by_clip(self, clip_idx: str) -> List: + """Get d_vector by clip ID. Args: clip_idx (str): Target clip ID. Returns: - List: x_vector as a list. + List: d_vector as a list. 
""" - return self.x_vectors[clip_idx]["embedding"] + return self.d_vectors[clip_idx]["embedding"] - def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all x_vectors of a speaker. + def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all d_vectors of a speaker. Args: speaker_idx (str): Target speaker ID. Returns: - List[List]: all the x_vectors of the given speaker. + List[List]: all the d_vectors of the given speaker. """ - return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] + return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean x_vector of a speaker ID. + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean d_vector of a speaker ID. Args: speaker_idx (str): Target speaker ID. num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. + randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False. Returns: - np.ndarray: Mean x_vector. + np.ndarray: Mean d_vector. """ - x_vectors = self.get_x_vectors_by_speaker(speaker_idx) + d_vectors = self.get_d_vectors_by_speaker(speaker_idx) if num_samples is None: - x_vectors = np.stack(x_vectors).mean(0) + d_vectors = np.stack(d_vectors).mean(0) else: - assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" + assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" if randomize: - x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) + d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) else: - x_vectors = np.stack(x_vectors[:num_samples]).mean(0) - return x_vectors + d_vectors = np.stack(d_vectors[:num_samples]).mean(0) + return d_vectors def get_speakers(self) -> List: return self.speaker_ids def get_clips(self) -> List: - return sorted(self.x_vectors.keys()) + return sorted(self.d_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: """Initialize a speaker encoder model. @@ -284,14 +284,14 @@ class SpeakerManager: self.speaker_encoder_ap.do_sound_norm = True self.speaker_encoder_ap.do_trim_silence = True - def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: - """Compute a x_vector from a given audio file. + def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a d_vector from a given audio file. Args: wav_file (Union[str, list]): Target file path. Returns: - list: Computed x_vector. + list: Computed d_vector. 
""" def _compute(wav_file: str): @@ -299,30 +299,30 @@ class SpeakerManager: spec = self.speaker_encoder_ap.melspectrogram(waveform) spec = torch.from_numpy(spec.T) spec = spec.unsqueeze(0) - x_vector = self.speaker_encoder.compute_embedding(spec) - return x_vector + d_vector = self.speaker_encoder.compute_embedding(spec) + return d_vector if isinstance(wav_file, list): - # compute the mean x_vector - x_vectors = None + # compute the mean d_vector + d_vectors = None for wf in wav_file: - x_vector = _compute(wf) - if x_vectors is None: - x_vectors = x_vector + d_vector = _compute(wf) + if d_vectors is None: + d_vectors = d_vector else: - x_vectors += x_vector - return (x_vectors / len(wav_file))[0].tolist() - x_vector = _compute(wav_file) - return x_vector[0].tolist() + d_vectors += d_vector + return (d_vectors / len(wav_file))[0].tolist() + d_vector = _compute(wav_file) + return d_vector[0].tolist() - def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute x_vector from features. + def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute d_vector from features. Args: feats (Union[torch.Tensor, np.ndarray]): Input features. Returns: - List: computed x_vector. + List: computed d_vector. """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 35b7d818..0cb8df38 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -64,9 +64,9 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): +def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None): outputs = model.inference( - inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} + inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel} ) return outputs @@ -139,13 +139,13 @@ def speaker_id_to_torch(speaker_id, cuda=False): return speaker_id -def embedding_to_torch(x_vector, cuda=False): - if x_vector is not None: - x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(d_vector, cuda=False): + if d_vector is not None: + d_vector = np.asarray(d_vector) + d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: - return x_vector.cuda() - return x_vector + return d_vector.cuda() + return d_vector # TODO: perform GL with pytorch for batching @@ -177,7 +177,7 @@ def synthesis( enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - x_vector=None, + d_vector=None, backend="torch", ): """Synthesize voice for the given text. 
@@ -209,8 +209,8 @@ def synthesis( if speaker_id is not None: speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if x_vector is not None: - x_vector = embedding_to_torch(x_vector, cuda=use_cuda) + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) @@ -227,7 +227,7 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a31436d4..8f510f20 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -63,7 +63,7 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} - self.speaker_embedding_dim = 0 + self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -98,9 +98,9 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers - self.speaker_embedding_dim = self.speaker_manager.x_vector_dim + self.d_vector_dim = self.speaker_manager.d_vector_dim def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. @@ -135,7 +135,7 @@ class Synthesizer(object): self.input_size, num_speakers=self.num_speakers, c=self.tts_config, - speaker_embedding_dim=self.speaker_embedding_dim, + d_vector_dim=self.d_vector_dim, ) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: @@ -197,9 +197,9 @@ class Synthesizer(object): print(sens) if self.tts_speakers_file: - # get the speaker embedding from the saved x_vectors. + # get the speaker embedding from the saved d_vectors. if speaker_idx and isinstance(speaker_idx, str): - speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " @@ -214,9 +214,9 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) - # compute a new x_vector from the given clip. + # compute a new d_vector from the given clip. 
if speaker_wav is not None: - speaker_embedding = self.speaker_manager.compute_x_vector_from_clip(speaker_wav) + speaker_embedding = self.speaker_manager.compute_d_vector_from_clip(speaker_wav) use_gl = self.vocoder_model is None @@ -232,7 +232,7 @@ class Synthesizer(object): style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - x_vector=speaker_embedding, + d_vector=speaker_embedding, ) waveform = outputs["wav"] mel_postnet_spec = outputs["model_outputs"] diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index ddc7e4da..d16167ed 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -22,7 +22,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -41,7 +41,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -60,7 +60,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index f80e56fc..a695fe61 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -15,11 +15,11 @@ encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") -x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") +d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") class SpeakerManagerTest(unittest.TestCase): - """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + """Test SpeakerManager for loading embedding files and computing d_vectors from waveforms""" @staticmethod def test_speaker_embedding(): @@ -38,38 +38,38 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - x_vector = manager.compute_x_vector(mel.T) - assert x_vector.shape[1] == 256 + d_vector = manager.compute_d_vector(mel.T) + assert d_vector.shape[1] == 256 - # compute x_vector directly from an input file - x_vector = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector = torch.FloatTensor(x_vector) - x_vector2 = torch.FloatTensor(x_vector2) - assert x_vector.shape[0] == 256 - assert (x_vector - 
x_vector2).sum() == 0.0 + # compute d_vector directly from an input file + d_vector = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = torch.FloatTensor(d_vector) + d_vector2 = torch.FloatTensor(d_vector2) + assert d_vector.shape[0] == 256 + assert (d_vector - d_vector2).sum() == 0.0 - # compute x_vector from a list of wav files. - x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2]) - x_vector3 = torch.FloatTensor(x_vector3) - assert x_vector3.shape[0] == 256 - assert (x_vector - x_vector3).sum() != 0.0 + # compute d_vector from a list of wav files. + d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = torch.FloatTensor(d_vector3) + assert d_vector3.shape[0] == 256 + assert (d_vector - d_vector3).sum() != 0.0 # remove dummy model os.remove(encoder_model_path) @staticmethod def test_speakers_file_processing(): - manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) + manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.x_vector_dim) + print(manager.d_vector_dim) print(manager.clip_ids) - x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) - assert len(x_vector) == 256 - x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) - assert len(x_vectors[0]) == 256 - x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) - assert len(x_vector1) == 256 - x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) - assert len(x_vector2) == 256 - assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0 + d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + assert len(d_vector) == 256 + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0]) + assert len(d_vectors[0]) == 256 + d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + assert len(d_vector1) == 256 + d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + assert len(d_vector2) == 256 + assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 66339a82..7c4f0adf 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -57,7 +57,7 @@ def test_speedy_speech(): # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model.forward( - x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} + x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) o_de = outputs["model_outputs"] attn = outputs["alignments"] @@ -71,7 +71,7 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) + model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] o_dr = outputs["durations_log"] diff --git a/tests/tts_tests/test_tacotron2_model.py 
b/tests/tts_tests/test_tacotron2_model.py index 0933ec70..b77f7cc5 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -95,7 +95,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 @@ -105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -259,7 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, use_gst=True, gst=c.gst).to( + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to( device ) model.train() @@ -271,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 86de5d16..31682d7a 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -116,7 +116,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): decoder_output_dim=c.audio["num_mels"], r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -305,7 +305,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): gst=c.gst, r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, 
mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) From a605dd3d08b74b8ca145afbedb222560ec5ff686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 11:46:53 +0200 Subject: [PATCH 064/258] Compute d_vectors and speaker_ids separately in TTSDataset --- TTS/bin/extract_tts_spectrograms.py | 19 ++++---------- TTS/trainer.py | 40 +++++++++++------------------ TTS/tts/datasets/TTSDataset.py | 35 +++++++++++++++++-------- 3 files changed, 44 insertions(+), 50 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 5137d48a..c4def748 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -39,7 +39,8 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_mapping=speaker_manager.speaker_ids + speaker_id_mapping=speaker_manager.speaker_ids, + d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None, ) @@ -84,22 +85,12 @@ def format_data(data): mel_input = data[4] mel_lengths = data[5] item_idx = data[7] - attn_mask = data[9] + d_vectors = data[8] + speaker_ids = data[9] + attn_mask = data[10] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) - if c.use_speaker_embedding: - if c.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) diff --git a/TTS/trainer.py b/TTS/trainer.py index 55560624..7136e023 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -267,7 +267,8 @@ class TrainerTTS: is_eval: bool, data_items: List, verbose: bool, - speaker_mapping: Union[Dict, List], + speaker_ids: Union[Dict, List], + d_vectors: Union[Dict, List] ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None @@ -289,9 +290,10 @@ class TrainerTTS: enable_eos_bos=self.config.enable_eos_bos_chars, use_noise_augment=not is_eval, verbose=verbose, - speaker_mapping=speaker_mapping - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, + speaker_id_mapping=speaker_ids + if self.config.use_speaker_embedding else None, + d_vector_mapping=d_vectors + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -313,14 +315,14 @@ class TrainerTTS: return loader def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_mapping) + return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: 
Dict ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_mapping) + return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) def format_batch(self, batch: List) -> Dict: # setup input batch @@ -332,24 +334,12 @@ class TrainerTTS: mel_lengths = batch[5] stop_targets = batch[6] item_idx = batch[7] - speaker_embeddings = batch[8] - attn_mask = batch[9] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] max_text_length = torch.max(text_lengths.float()) max_spec_length = torch.max(mel_lengths.float()) - # convert speaker names to ids - if self.config.use_speaker_embedding: - if self.config.use_external_speaker_embedding_file: - speaker_embeddings = batch[8] - speaker_ids = None - else: - speaker_ids = [self.speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - # compute durations from attention masks durations = None if attn_mask is not None: @@ -640,11 +630,11 @@ class TrainerTTS: # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors ) self.eval_loader = ( self.get_eval_dataloder( - self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors ) if self.config.run_eval else None diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 76f82c97..2522b55a 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -29,7 +29,8 @@ class TTSDataset(Dataset): phoneme_cache_path=None, phoneme_language="en-us", enable_eos_bos=False, - speaker_mapping=None, + speaker_id_mapping=None, + d_vector_mapping=None, use_noise_augment=False, verbose=False, ): @@ -51,6 +52,8 @@ class TTSDataset(Dataset): phoneme_language (str): one the languages from https://github.com/bootphon/phonemizer#languages enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. + speaker_id_mapping (dict): list of speaker ids to map speaker names to numerical ids. + d_vector_mapping (dict): dictionary of d-vectors that maps each audio file to a pre-computed d-vector. use_noise_augment (bool): enable adding random noise to wav for augmentation. verbose (bool): print diagnostic information. 
""" @@ -70,7 +73,8 @@ class TTSDataset(Dataset): self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos - self.speaker_mapping = speaker_mapping + self.speaker_id_mapping = speaker_id_mapping + self.d_vector_mapping = d_vector_mapping self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False @@ -293,13 +297,18 @@ class TTSDataset(Dataset): item_idxs = [batch[idx]["item_idx"] for idx in ids_sorted_decreasing] text = [batch[idx]["text"] for idx in ids_sorted_decreasing] - speaker_name = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] - # get speaker embeddings - if self.speaker_mapping is not None: + speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] + # get pre-computed d-vectors + if self.d_vector_mapping is not None: wav_files_names = [batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing] - speaker_embedding = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] + d_vectors = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] else: - speaker_embedding = None + d_vectors = None + # get numerical speaker ids from speaker names + if self.speaker_id_mapping: + speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in speaker_names] + else: + speaker_ids = None # compute features mel = [self.ap.melspectrogram(w).astype("float32") for w in wav] @@ -327,8 +336,11 @@ class TTSDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - if speaker_embedding is not None: - speaker_embedding = torch.FloatTensor(speaker_embedding) + if d_vectors is not None: + d_vectors = torch.FloatTensor(d_vectors) + + if speaker_ids is not None: + speaker_ids = torch.LongTensor(speaker_ids) # compute linear spectrogram if self.compute_linear_spec: @@ -355,13 +367,14 @@ class TTSDataset(Dataset): return ( text, text_lenghts, - speaker_name, + speaker_names, linear, mel, mel_lengths, stop_targets, item_idxs, - speaker_embedding, + d_vectors, + speaker_ids, attns, ) From 147550c65fbf91ed62323c4a17bbebddd054d81f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 13:05:54 +0200 Subject: [PATCH 065/258] make style and linter fixes --- TTS/bin/extract_tts_spectrograms.py | 1 - TTS/trainer.py | 23 +++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index c4def748..79021bf1 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -81,7 +81,6 @@ def format_data(data): # setup input data text_input = data[0] text_lengths = data[1] - speaker_names = data[2] mel_input = data[4] mel_lengths = data[5] item_idx = data[7] diff --git a/TTS/trainer.py b/TTS/trainer.py index 7136e023..f837ce7f 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -268,7 +268,7 @@ class TrainerTTS: data_items: List, verbose: bool, speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List] + d_vectors: Union[Dict, List], ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None @@ -290,10 +290,10 @@ class TrainerTTS: enable_eos_bos=self.config.enable_eos_bos_chars, use_noise_augment=not is_eval, verbose=verbose, - speaker_id_mapping=speaker_ids - if self.config.use_speaker_embedding else None, + speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, d_vector_mapping=d_vectors - if 
self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file else None, + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -383,6 +383,7 @@ class TrainerTTS: return { "text_input": text_input, "text_lengths": text_lengths, + "speaker_names": speaker_names, "mel_input": mel_input, "mel_lengths": mel_lengths, "linear_input": linear_input, @@ -630,11 +631,21 @@ class TrainerTTS: # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, ) self.eval_loader = ( self.get_eval_dataloder( - self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, ) if self.config.run_eval else None From d2fd6a34a1709086f3c58dd39f8d06011559a6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 5 Jun 2021 11:46:53 +0200 Subject: [PATCH 066/258] use get_speaker_manager in Trainer and save speakers.json file when needed --- TTS/trainer.py | 22 ++-------------------- TTS/tts/utils/speakers.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index f837ce7f..564c4c26 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -21,7 +21,7 @@ from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -186,25 +186,7 @@ class TrainerTTS: def get_speaker_manager( config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None ) -> SpeakerManager: - speaker_manager = SpeakerManager() - if restore_path: - speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") - if not os.path.exists(speakers_file): - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speakers_file = config.external_speaker_embedding_file - - if config.use_external_speaker_embedding_file: - speaker_manager.load_d_vectors_file(speakers_file) - else: - speaker_manager.load_ids_file(speakers_file) - elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(data_train) - file_path = os.path.join(out_path, "speakers.json") - speaker_manager.save_ids_file(file_path) + speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) return speaker_manager @staticmethod diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 546d483d..0f43bf97 100755 --- 
a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -34,16 +34,16 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c, args, meta_data_train): +def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): """Inititalize and return a `SpeakerManager` based on config values""" speaker_manager = SpeakerManager() if c.use_speaker_embedding: speaker_manager.set_speaker_ids_from_data(meta_data_train) - if args.restore_path: + if restore_path: # restoring speaker manager from a previous run. if c.use_external_speaker_embedding_file: # restore speaker manager with the embedding file - speakers_file = os.path.dirname(args.restore_path) + speakers_file = os.path.dirname(restore_path) if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" @@ -55,7 +55,7 @@ def get_speaker_manager(c, args, meta_data_train): speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. - speakers_file = os.path.dirname(args.restore_path) + speakers_file = os.path.dirname(restore_path) speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) assert all( @@ -73,6 +73,14 @@ def get_speaker_manager(c, args, meta_data_train): speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) ) ) + # save file if path is defined + if out_path: + out_file_path = os.path.join(out_path, "speaker.json") + print(" > Saving `speaker.json` to {out_file_path}.") + if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: + speaker_manager.save_d_vectors_to_file(out_file_path) + else: + speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager From 3f96491168ce0eba091eac290a1bae5d95fde893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 5 Jun 2021 11:48:16 +0200 Subject: [PATCH 067/258] reduce multiband melgan test model size --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 - tests/vocoder_tests/test_multiband_melgan_train.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index fbce03eb..f93a5318 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -20,7 +20,6 @@ config = FullbandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 081fb40e..4f12782f 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -21,6 +21,7 @@ config = MultibandMelganConfig( print_step=1, print_eval=True, data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, ) config.audio.do_trim_silence = True From 
cbb52b3d8391d899ae53a3921238ddad256a04cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:21:13 +0200 Subject: [PATCH 068/258] fix speaker_manager init --- TTS/trainer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 564c4c26..9fe2f108 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -101,9 +101,7 @@ class TrainerTTS: self.data_train, self.data_eval = load_meta_data(self.config.datasets) # default speaker manager - self.speaker_manager = self.get_speaker_manager( - self.config, args.restore_path, self.config.output_path, self.data_train - ) + self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) # init TTS model if model is not None: @@ -587,7 +585,7 @@ class TrainerTTS: speaker_id = 0 if self.config.use_speaker_embedding else None # setup d_vector d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None ) From 0206bb847bf7b0a569c6e9099668c7433e70811f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:38:01 +0200 Subject: [PATCH 069/258] add max_decoder_steps argument to tacotron models --- TTS/tts/configs/tacotron_config.py | 3 +++ TTS/tts/layers/tacotron/tacotron.py | 4 +++- TTS/tts/layers/tacotron/tacotron2.py | 4 +++- TTS/tts/models/__init__.py | 2 ++ TTS/tts/models/tacotron.py | 4 ++++ TTS/tts/models/tacotron2.py | 4 ++++ tests/tts_tests/test_tacotron2_train.py | 1 + tests/tts_tests/test_tacotron_layers.py | 1 + tests/tts_tests/test_tacotron_train.py | 2 ++ 9 files changed, 23 insertions(+), 2 deletions(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index b197eaf6..2b67901c 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -46,6 +46,8 @@ class TacotronConfig(BaseTTSConfig): stopnet_pos_weight (float): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. + max_decoder_steps (int): + Max number of steps allowed for the decoder. Defaults to 10000. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -137,6 +139,7 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 + max_decoder_steps: int = 10000 # attention layers attention_type: str = "original" diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 2f94db88..a6579171 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -267,6 +267,7 @@ class Decoder(nn.Module): attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 500. 
""" # Pylint gets confused by PyTorch conventions here @@ -289,12 +290,13 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.r_init = r self.r = r self.in_channels = in_channels - self.max_decoder_steps = 500 + self.max_decoder_steps = max_decoder_steps self.use_memory_queue = memory_size > 0 self.memory_size = memory_size if memory_size > 0 else r self.frame_channels = frame_channels diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index aeca8953..61fe9f4b 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -135,6 +135,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. """ # Pylint gets confused by PyTorch conventions here @@ -155,6 +156,7 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.frame_channels = frame_channels @@ -162,7 +164,7 @@ class Decoder(nn.Module): self.r = r self.encoder_embedding_dim = in_channels self.separate_stopnet = separate_stopnet - self.max_decoder_steps = 1000 + self.max_decoder_steps = max_decoder_steps self.stop_threshold = 0.5 # model dimensions diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 026f5c85..2a951267 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -30,6 +30,7 @@ def setup_model(num_chars, num_speakers, c, d_vector_dim=None): double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, d_vector_dim=d_vector_dim, + max_decoder_steps=c.max_decoder_steps, ) elif c.model.lower() == "tacotron2": model = MyModel( @@ -56,6 +57,7 @@ def setup_model(num_chars, num_speakers, c, d_vector_dim=None): double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, d_vector_dim=d_vector_dim, + max_decoder_steps=c.max_decoder_steps, ) elif c.model.lower() == "glow_tts": model = MyModel( diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 123b69a7..5eeeedaa 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -49,6 +49,7 @@ class Tacotron(TacotronAbstract): output frames to the prenet. gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. 
""" def __init__( @@ -80,6 +81,7 @@ class Tacotron(TacotronAbstract): gst=None, memory_size=5, gradual_training=None, + max_decoder_steps=500, ): super().__init__( num_chars, @@ -143,6 +145,7 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) self.postnet = PostCBHG(decoder_output_dim) self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) @@ -180,6 +183,7 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 4628c64e..b6da4e44 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -47,6 +47,7 @@ class Tacotron2(TacotronAbstract): gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. """ def __init__( @@ -77,6 +78,7 @@ class Tacotron2(TacotronAbstract): use_gst=False, gst=None, gradual_training=None, + max_decoder_steps=500, ): super().__init__( num_chars, @@ -138,6 +140,7 @@ class Tacotron2(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) self.postnet = Postnet(self.postnet_output_dim) @@ -174,6 +177,7 @@ class Tacotron2(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) @staticmethod diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 0d9a67a5..face77ae 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -24,6 +24,7 @@ config = Tacotron2Config( epochs=1, print_step=1, print_eval=True, + max_decoder_steps=50, ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 6c4b76b5..783be0db 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -61,6 +61,7 @@ class DecoderTests(unittest.TestCase): forward_attn_mask=True, location_attn=True, separate_stopnet=True, + max_decoder_steps=50, ) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 52560715..9443d73a 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -23,6 +23,8 @@ config = TacotronConfig( epochs=1, print_step=1, print_eval=True, + r=5, + max_decoder_steps=50, ) config.audio.do_trim_silence = True config.audio.trim_db = 60 From a87c8864975403388e71ad9d041f745a0cc285c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:39:52 +0200 Subject: [PATCH 070/258] refactor and fix multi-speaker training in Trainer and Tacotron models --- TTS/__init__.py | 1 - TTS/tts/datasets/TTSDataset.py | 4 +- TTS/tts/datasets/formatters.py | 15 + TTS/tts/models/tacotron.py | 7 +- TTS/tts/models/tacotron2.py | 10 +- TTS/tts/utils/speakers.py | 33 +- tests/data/ljspeech/speakers.json | 2612 +++++++++++++++++ tests/data_tests/__init__.py | 0 tests/inference_tests/__init__.py | 0 tests/test_speaker_manager.py | 6 +- tests/text_tests/__init__.py | 0 .../test_tacotron2_d-vectors_train.py | 
57 + .../test_tacotron2_speaker_emb_train.py | 55 + 13 files changed, 2779 insertions(+), 21 deletions(-) create mode 100644 tests/data/ljspeech/speakers.json create mode 100644 tests/data_tests/__init__.py create mode 100644 tests/inference_tests/__init__.py create mode 100644 tests/text_tests/__init__.py create mode 100644 tests/tts_tests/test_tacotron2_d-vectors_train.py create mode 100644 tests/tts_tests/test_tacotron2_speaker_emb_train.py diff --git a/TTS/__init__.py b/TTS/__init__.py index da35faf8..5162d4ec 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1,6 +1,5 @@ import os - with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: version = f.read().strip() diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 2522b55a..d0fbb553 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -301,12 +301,12 @@ class TTSDataset(Dataset): # get pre-computed d-vectors if self.d_vector_mapping is not None: wav_files_names = [batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing] - d_vectors = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] + d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] else: d_vectors = None # get numerical speaker ids from speaker names if self.speaker_id_mapping: - speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in speaker_names] + speaker_ids = [self.speaker_id_mapping[sn] for sn in speaker_names] else: speaker_ids = None # compute features diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 815a1b1d..3cb37168 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -107,6 +107,21 @@ def ljspeech(root_path, meta_file): return items +def ljspeech_test(root_path, meta_file): + """Normalizes the LJSpeech meta data file for TTS testing + https://keithito.com/LJ-Speech-Dataset/""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "ljspeech" + with open(txt_file, "r", encoding="utf-8") as ttf: + for idx, line in enumerate(ttf): + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + text = cols[1] + items.append([text, wav_file, f"ljspeech-{idx}"]) + return items + + def sam_accenture(root_path, meta_file): """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5eeeedaa..3ee70431 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -262,7 +262,12 @@ class Tacotron(TacotronAbstract): if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"]) + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] else: # B x 1 x speaker_embed_dim embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index b6da4e44..f6e59542 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -261,9 +261,13 @@ class Tacotron2(TacotronAbstract): # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) 
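# [Editor's note, not part of the patch] The Tacotron2 hunk below mirrors the Tacotron fix shown
# above: when `use_d_vectors` is False, the integer `speaker_ids` from the batch are looked up in
# the learned `speaker_embedding` layer and given extra singleton axes before being combined with
# the encoder outputs; when pre-computed d-vectors are in use, `cond_input["d_vectors"]` is taken
# directly. This summary is inferred from the diff itself, not text from the patch.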
if self.num_speakers > 1: - if not self.embeddings_per_sample: - x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] - x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) + if not self.use_d_vectors: + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[None] + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] else: embedded_speakers = cond_input["d_vectors"] diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 0f43bf97..01e26c6b 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -11,9 +11,16 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") +def _set_file_path(path): + """Find the speakers.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "speakers.json") + path_continue = os.path.join(path, "speakers.json") + if os.path.exists(path_restore): + return path_restore + if os.path.exists(path_continue): + return path_continue + raise FileNotFoundError(f" [!] `speakers.json` not found in {path}") def load_speaker_mapping(out_path): @@ -21,7 +28,7 @@ def load_speaker_mapping(out_path): if os.path.splitext(out_path)[1] == ".json": json_file = out_path else: - json_file = make_speakers_json_path(out_path) + json_file = _set_file_path(out_path) with open(json_file) as f: return json.load(f) @@ -29,7 +36,7 @@ def load_speaker_mapping(out_path): def save_speaker_mapping(out_path, speaker_mapping): """Saves speaker mapping if not yet present.""" if out_path is not None: - speakers_json_path = make_speakers_json_path(out_path) + speakers_json_path = _set_file_path(out_path) with open(speakers_json_path, "w") as f: json.dump(speaker_mapping, f, indent=4) @@ -40,10 +47,10 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): if c.use_speaker_embedding: speaker_manager.set_speaker_ids_from_data(meta_data_train) if restore_path: + speakers_file = _set_file_path(restore_path) # restoring speaker manager from a previous run. if c.use_external_speaker_embedding_file: # restore speaker manager with the embedding file - speakers_file = os.path.dirname(restore_path) if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" @@ -55,7 +62,6 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. 
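# [Editor's sketch, not part of the patch] The `os.path.dirname(restore_path)` line removed just
# below is what the new `_set_file_path` helper (added near the top of this file in this patch)
# replaces: it searches for `speakers.json` both next to a restored checkpoint and inside a
# continued run directory. Illustrative behaviour, assuming a hypothetical run folder
# `output/run-June-06/` containing `best_model.pth.tar` and `speakers.json`:
#
#     _set_file_path("output/run-June-06/best_model.pth.tar")
#     # -> "output/run-June-06/speakers.json"   (restore: sibling of the checkpoint file)
#     _set_file_path("output/run-June-06")
#     # -> "output/run-June-06/speakers.json"   (continue: inside the run directory)
#
# If neither candidate path exists, the helper raises FileNotFoundError, as defined above.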
- speakers_file = os.path.dirname(restore_path) speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) assert all( @@ -75,8 +81,8 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): ) # save file if path is defined if out_path: - out_file_path = os.path.join(out_path, "speaker.json") - print(" > Saving `speaker.json` to {out_file_path}.") + out_file_path = os.path.join(out_path, "speakers.json") + print(f" > Saving `speakers.json` to {out_file_path}.") if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: speaker_manager.save_d_vectors_to_file(out_file_path) else: @@ -138,7 +144,7 @@ class SpeakerManager: self.speaker_encoder_ap = None if data_items: - self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) + self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items) if d_vectors_file_path: self.set_d_vectors_from_file(d_vectors_file_path) @@ -163,6 +169,10 @@ class SpeakerManager: def num_speakers(self): return len(self.speaker_ids) + @property + def speaker_names(self): + return list(self.speaker_ids.keys()) + @property def d_vector_dim(self): """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" @@ -224,7 +234,8 @@ class SpeakerManager: file_path (str): Path to the target json file. """ self.d_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values()))) + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) def get_d_vector_by_clip(self, clip_idx: str) -> List: diff --git a/tests/data/ljspeech/speakers.json b/tests/data/ljspeech/speakers.json new file mode 100644 index 00000000..915cff73 --- /dev/null +++ b/tests/data/ljspeech/speakers.json @@ -0,0 +1,2612 @@ +{ + "LJ001-0001.wav": { + "name": "ljspeech-0", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 
0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + 
-0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0002.wav": { + "name": "ljspeech-1", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + 
-0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + 
-0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0003.wav": { + "name": "ljspeech-2", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + 
-0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0004.wav": { + "name": "ljspeech-3", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 
0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 
0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0005.wav": { + "name": "ljspeech-4", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 
0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 
0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0006.wav": { + "name": "ljspeech-5", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 
0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0007.wav": { + "name": 
"ljspeech-6", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + 
-0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0008.wav": { + "name": "ljspeech-7", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + 
-0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, 
+ 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0009.wav": { + "name": "ljspeech-8", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + 
-0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 
0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0010.wav": { + "name": "ljspeech-9", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + 
-0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + } +} diff --git a/tests/data_tests/__init__.py b/tests/data_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/inference_tests/__init__.py 
b/tests/inference_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index a695fe61..baa50749 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -66,10 +66,10 @@ class SpeakerManagerTest(unittest.TestCase): print(manager.clip_ids) d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) assert len(d_vector) == 256 - d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0]) + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) assert len(d_vectors[0]) == 256 - d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) assert len(d_vector1) == 256 - d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) assert len(d_vector2) == 256 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/text_tests/__init__.py b/tests/text_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py new file mode 100644 index 00000000..7fda7e09 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -0,0 +1,57 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_val_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + use_external_speaker_embedding_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + external_speaker_embedding_file="tests/data/ljspeech/speakers.json", + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py new file mode 100644 index 00000000..a242c724 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ 
-0,0 +1,55 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_val_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 7ea71c758651c46520a4d53df1f9690261884782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:41:45 +0200 Subject: [PATCH 071/258] use one testing sentence in tts tests --- tests/tts_tests/test_align_tts_train.py | 3 +++ tests/tts_tests/test_glow_tts_train.py | 3 +++ tests/tts_tests/test_speedy_speech_train.py | 3 +++ tests/tts_tests/test_tacotron2_train.py | 3 +++ tests/tts_tests/test_tacotron_train.py | 3 +++ 5 files changed, 15 insertions(+) diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 3d802d5f..61d67c5c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -23,6 +23,9 @@ config = AlignTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index bd119b9c..c4d57edd 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -23,6 +23,9 @@ config = GlowTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9977864e..bf635bc9 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -23,6 +23,9 @@ config = SpeedySpeechConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_tacotron2_train.py 
b/tests/tts_tests/test_tacotron2_train.py index face77ae..70975490 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -23,6 +23,9 @@ config = Tacotron2Config( test_delay_epochs=-1, epochs=1, print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], print_eval=True, max_decoder_steps=50, ) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 9443d73a..010154e2 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -22,6 +22,9 @@ config = TacotronConfig( test_delay_epochs=-1, epochs=1, print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], print_eval=True, r=5, max_decoder_steps=50, From 4f29725eb609815a8abcf518832c5e3e346fb43b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:42:35 +0200 Subject: [PATCH 072/258] fix glow-tts `inference()` --- TTS/tts/models/glow_tts.py | 5 +++- TTS/tts/utils/synthesis.py | 31 +++++++++++++++++++++-- tests/inference_tests/test_synthesizer.py | 3 ++- tests/tts_tests/test_tacotron2_model.py | 4 +-- tests/tts_tests/test_tacotron2_train.py | 1 - 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 9c928a67..3b3207f0 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -290,7 +290,10 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value + def inference( + self, x, cond_input={"x_lengths": None, "d_vectors": None} + ): # pylint: disable=dangerous-default-value + x_lengths = cond_input["x_lengths"] g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: if self.d_vector_dim: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 0cb8df38..8e026b66 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,8 +1,10 @@ import os +from typing import Dict import numpy as np import pkg_resources import torch +from torch import nn from .text import phoneme_to_sequence, text_to_sequence @@ -64,9 +66,34 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None): +def run_model_torch( + model: nn.Module, + inputs: torch.Tensor, + speaker_id: int = None, + style_mel: torch.Tensor = None, + d_vector: torch.Tensor = None, +) -> Dict: + """Run a torch model for inference. It does not support batch inference. + + Args: + model (nn.Module): The model to run inference. + inputs (torch.Tensor): Input tensor with character ids. + speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None. + style_mel (torch.Tensor, optional): Spectrograms used for voice styling . Defaults to None. + d_vector (torch.Tensor, optional): d-vector for multi-speaker models . Defaults to None. + + Returns: + Dict: model outputs. 
+ """ + input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) outputs = model.inference( - inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel} + inputs, + cond_input={ + "x_lengths": input_lengths, + "speaker_ids": speaker_id, + "d_vectors": d_vector, + "style_mel": style_mel, + }, ) return outputs diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index b0fa22d3..4379c8ca 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,13 +1,14 @@ import os import unittest -from tests import get_tests_output_path from TTS.config import load_config from TTS.tts.models import setup_model from TTS.tts.utils.io import save_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.synthesizer import Synthesizer +from .. import get_tests_output_path + class SynthesizerTest(unittest.TestCase): # pylint: disable=R0201 diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index b77f7cc5..66372470 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -259,9 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to( - device - ) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 70975490..577de014 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -8,7 +8,6 @@ from TTS.tts.configs import Tacotron2Config config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") - config = Tacotron2Config( r=5, batch_size=8, From 614738cc85ca0cffa29431611fa1ee4463e43909 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:42:52 +0200 Subject: [PATCH 073/258] downsize melgan test model size --- TTS/tts/datasets/formatters.py | 1 - tests/vocoder_tests/test_melgan_train.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 3cb37168..db7841f4 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -112,7 +112,6 @@ def ljspeech_test(root_path, meta_file): https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" with open(txt_file, "r", encoding="utf-8") as ttf: for idx, line in enumerate(ttf): cols = line.split("|") diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 3ff65b5a..551b786a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,7 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, + discriminator_model_params={"base_channels": 16, "max_channels": 64, 
"downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From 59be1b9af1d21d0f1f4fbc2672265e05b3f6a964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 15:20:17 +0200 Subject: [PATCH 074/258] adjust `distribute.py` for the `train_tts.py` --- TTS/bin/distribute.py | 3 +- TTS/trainer.py | 52 +++++++++++++------ TTS/tts/utils/synthesis.py | 6 ++- .../ljspeech/tacotron2-DDC/tacotron2-DDC.json | 7 ++- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index ea43f88b..20d4bb20 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -30,7 +30,7 @@ def main(): parser.add_argument( "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in sys.argv ) - args = parser.parse_args() + args, unargs = parser.parse_known_args() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") @@ -42,6 +42,7 @@ def main(): command.append("--restore_path={}".format(args.restore_path)) command.append("--config_path={}".format(args.config_path)) command.append("--group_id=group_{}".format(group_id)) + command += unargs command.append("") # run processes diff --git a/TTS/trainer.py b/TTS/trainer.py index 9fe2f108..76c741b1 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -65,15 +65,19 @@ class TrainerTTS: self, args: Union[Coqpit, Namespace], config: Coqpit, - c_logger: ConsoleLogger, - tb_logger: TensorboardLogger, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, model: nn.Module = None, output_path: str = None, ) -> None: self.args = args self.config = config - self.c_logger = c_logger - self.tb_logger = tb_logger + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + else: + self.tb_logger = tb_logger self.output_path = output_path self.total_steps_done = 0 @@ -117,20 +121,20 @@ class TrainerTTS: # setup criterion self.criterion = self.get_criterion(self.config) - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - # DISTRUBUTED if self.num_gpus > 1: init_distributed( args.rank, self.num_gpus, args.group_id, - self.config.distributed["backend"], - self.config.distributed["url"], + self.config.distributed_backend, + self.config.distributed_url, ) + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + # scalers for mixed precision training self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None @@ -147,7 +151,7 @@ class TrainerTTS: # DISTRUBUTED if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank]) + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) # count model size num_params = count_parameters(self.model) @@ -377,6 +381,11 @@ class TrainerTTS: "item_idx": item_idx, } + def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.train_step(batch, criterion) + return self.model.train_step(batch, criterion) + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -389,7 +398,7 @@ class TrainerTTS: self.optimizer.zero_grad() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.train_step(batch, self.criterion) + outputs, loss_dict = self._train_step(batch, self.criterion) # check nan loss if torch.isnan(loss_dict["loss"]).any(): @@ -473,7 +482,10 @@ class TrainerTTS: scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) # training visualizations - figures, audios = self.model.train_log(self.ap, batch, outputs) + if hasattr(self.model, "module"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + else: + figures, audios = self.model.train_log(self.ap, batch, outputs) self.tb_logger.tb_train_figures(self.total_steps_done, figures) self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) self.total_steps_done += 1 @@ -500,12 +512,17 @@ class TrainerTTS: if self.config.tb_model_param_stats: self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.eval_step(batch, self.criterion) + return self.model.eval_step(batch, self.criterion) + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.eval_step(batch, self.criterion) + outputs, loss_dict = self._eval_step(batch) step_time = time.time() - step_start_time @@ -542,7 +559,10 @@ class TrainerTTS: outputs, _ = self.eval_step(batch, cur_step) # Plot epoch stats and samples from the last batch. 
if self.args.rank == 0: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + if hasattr(self.model, "module"): + figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) + else: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) self.tb_logger.tb_eval_figures(self.total_steps_done, figures) self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) @@ -642,7 +662,7 @@ class TrainerTTS: self.train_epoch() if self.config.run_eval: self.eval_epoch() - if epoch >= self.config.test_delay_epochs: + if epoch >= self.config.test_delay_epochs and self.args.rank < 0: self.test_run() self.c_logger.print_epoch_end( epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 8e026b66..047d67f0 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -86,7 +86,11 @@ def run_model_torch( Dict: model outputs. """ input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) - outputs = model.inference( + if hasattr(model, "module"): + _func = model.module.inference + else: + _func = model.inference + outputs = _func( inputs, cond_input={ "x_lengths": input_lengths, diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..e3531851 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -36,12 +36,14 @@ "gst_num_heads": 4, "gst_num_style_tokens": 10 }, + "distributed_backend": "gloo", + "distributed_url": "tcp:\/\/localhost:54321", "model": "Tacotron2", "run_name": "ljspeech-ddc", "run_description": "tacotron2 with double decoder consistency.", "batch_size": 64, "eval_batch_size": 16, - "mixed_precision": true, + "mixed_precision": false, "loss_masking": true, "decoder_loss_alpha": 0.25, "postnet_loss_alpha": 0.25, @@ -54,6 +56,7 @@ "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, + "max_decoder_steps": 50, "noam_schedule": true, "grad_clip": 0.05, "epochs": 1000, @@ -88,4 +91,4 @@ "phoneme_cache_path": "DEFINE THIS", "use_phonemes": false, "phoneme_language": "en-us" -} \ No newline at end of file +} From 4575b7082683f638705b8553668ceb26e2e86b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 7 Jun 2021 15:11:33 +0200 Subject: [PATCH 075/258] merge if branches with the same implementation --- TTS/tts/utils/synthesis.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 047d67f0..fd9a75cd 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -247,15 +247,11 @@ def synthesis( style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = text_inputs.unsqueeze(0) - elif backend == "tf": + elif backend in ["tf", "tflite"]: # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) text_inputs = numpy_to_tf(text_inputs, tf.int32) text_inputs = tf.expand_dims(text_inputs, 0) - elif backend == "tflite": - style_mel = numpy_to_tf(style_mel, tf.float32) - text_inputs = numpy_to_tf(text_inputs, tf.int32) - text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) From 
f077a356e0c59b1770a977323558a3062da9d269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 7 Jun 2021 16:08:56 +0200 Subject: [PATCH 076/258] rename to --- TTS/bin/extract_tts_spectrograms.py | 6 ++--- TTS/trainer.py | 14 +++++----- TTS/tts/models/align_tts.py | 12 ++++----- TTS/tts/models/glow_tts.py | 22 +++++++-------- TTS/tts/models/speedy_speech.py | 12 ++++----- TTS/tts/models/tacotron.py | 26 +++++++++--------- TTS/tts/models/tacotron2.py | 28 ++++++++++---------- TTS/tts/models/tacotron_abstract.py | 6 ++--- TTS/tts/utils/synthesis.py | 2 +- TTS/utils/generic_utils.py | 2 +- recipes/kokoro/tacotron2-DDC/run.sh | 10 +++---- recipes/ljspeech/tacotron2-DDC/run.sh | 8 +++--- tests/tts_tests/test_speedy_speech_layers.py | 4 +-- tests/tts_tests/test_tacotron2_model.py | 10 +++---- tests/tts_tests/test_tacotron_model.py | 10 +++---- 15 files changed, 85 insertions(+), 87 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 79021bf1..c5ba1b2a 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -136,14 +136,14 @@ def inference( speaker_c = d_vectors outputs = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c} + text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c} ) model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] # normalize tacotron output if model_name == "tacotron": diff --git a/TTS/trainer.py b/TTS/trainer.py index 76c741b1..c1d1c340 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -573,7 +573,7 @@ class TrainerTTS: test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - cond_inputs = self._get_cond_inputs() + aux_inputs = self._get_aux_inputs() for idx, sen in enumerate(test_sentences): wav, alignment, model_outputs, _ = synthesis( self.model, @@ -581,9 +581,9 @@ class TrainerTTS: self.config, self.use_cuda, self.ap, - speaker_id=cond_inputs["speaker_id"], - d_vector=cond_inputs["d_vector"], - style_wav=cond_inputs["style_wav"], + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, @@ -600,7 +600,7 @@ class TrainerTTS: self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - def _get_cond_inputs(self) -> Dict: + def _get_aux_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup d_vector @@ -620,8 +620,8 @@ class TrainerTTS: print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return cond_inputs + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs def 
fit(self) -> None: if self.restore_step != 0 or self.args.best_path: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 20b0cdf7..6c268a43 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -212,7 +212,7 @@ class AlignTTS(nn.Module): return dr_mas, mu, log_sigma, logp def forward( - self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None + self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None ): # pylint: disable=unused-argument """ Shapes: @@ -223,7 +223,7 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -267,14 +267,14 @@ class AlignTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument + def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -296,8 +296,8 @@ class AlignTTS(nn.Module): d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] - cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input, self.phase) loss_dict = criterion( outputs["logp"], outputs["model_outputs"], diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 3b3207f0..e61b80c2 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -144,7 +144,7 @@ class GlowTTS(nn.Module): return y_mean, y_log_scale, o_attn_dur def forward( - self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None} + self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -157,7 +157,7 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -197,7 +197,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def inference_with_MAS( - self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None} + self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
@@ -212,7 +212,7 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -258,7 +258,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def decoder_inference( - self, y, y_lengths=None, cond_input={"d_vectors": None} + self, y, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -268,7 +268,7 @@ class GlowTTS(nn.Module): """ y = y.transpose(1, 2) y_max_length = y.size(2) - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None # norm speaker embeddings if g is not None: if self.external_d_vector_dim: @@ -290,11 +290,9 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference( - self, x, cond_input={"x_lengths": None, "d_vectors": None} - ): # pylint: disable=dangerous-default-value - x_lengths = cond_input["x_lengths"] - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None}): # pylint: disable=dangerous-default-value + x_lengths = aux_input["x_lengths"] + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -343,7 +341,7 @@ class GlowTTS(nn.Module): mel_lengths = batch["mel_lengths"] d_vectors = batch["d_vectors"] - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": d_vectors}) loss_dict = criterion( outputs["model_outputs"], diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 53f7bbaa..d4a90a2e 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward( - self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None} + self, x, x_lengths, y_lengths, dr, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids @@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else 
None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 @@ -208,8 +208,8 @@ class SpeedySpeech(nn.Module): speaker_ids = batch["speaker_ids"] durations = batch["durations"] - cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, durations, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 3ee70431..317d1905 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -186,14 +186,14 @@ class Tacotron(TacotronAbstract): max_decoder_steps, ) - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ outputs = {"alignments_backward": None, "decoder_outputs_backward": None} inputs = self.embedding(text) @@ -205,15 +205,15 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) # speaker embedding if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features @@ -252,17 +252,17 @@ class Tacotron(TacotronAbstract): return outputs @torch.no_grad() - def inference(self, text_input, cond_input=None): - cond_input = self._format_cond_input(cond_input) + def inference(self, text_input, aux_input=None): + aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"]) + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"]) # reshape embedded_speakers if embedded_speakers.ndim == 1: embedded_speakers = embedded_speakers[None, None, :] @@ -270,7 +270,7 @@ class Tacotron(TacotronAbstract): embedded_speakers = embedded_speakers[None, :] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, 
embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -306,7 +306,7 @@ class Tacotron(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -317,8 +317,8 @@ class Tacotron(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index f6e59542..d56bd988 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -186,16 +186,16 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ - cond_input = self._format_cond_input(cond_input) + aux_input = self._format_aux_input(aux_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) @@ -206,14 +206,14 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -252,24 +252,24 @@ class Tacotron2(TacotronAbstract): return outputs @torch.no_grad() - def inference(self, text, cond_input=None): - cond_input = self._format_cond_input(cond_input) + def inference(self, text, aux_input=None): + aux_input = self._format_aux_input(aux_input) embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[None] 
+ embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] # reshape embedded_speakers if embedded_speakers.ndim == 1: embedded_speakers = embedded_speakers[None, None, :] elif embedded_speakers.ndim == 2: embedded_speakers = embedded_speakers[None, :] else: - embedded_speakers = cond_input["d_vectors"] + embedded_speakers = aux_input["d_vectors"] encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) @@ -307,7 +307,7 @@ class Tacotron2(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -318,8 +318,8 @@ class Tacotron2(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index e480e2e0..705ea5bc 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -6,7 +6,7 @@ import torch from torch import nn from TTS.tts.utils.data import sequence_mask -from TTS.utils.generic_utils import format_cond_input +from TTS.utils.generic_utils import format_aux_input from TTS.utils.training import gradual_training_scheduler @@ -97,8 +97,8 @@ class TacotronAbstract(ABC, nn.Module): self.coarse_decoder = None @staticmethod - def _format_cond_input(cond_input: Dict) -> Dict: - return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input) + def _format_aux_input(aux_input: Dict) -> Dict: + return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) ############################# # INIT FUNCTIONS diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index fd9a75cd..eba3916d 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -92,7 +92,7 @@ def run_model_torch( _func = model.inference outputs = _func( inputs, - cond_input={ + aux_input={ "x_lengths": input_lengths, "speaker_ids": speaker_id, "d_vectors": d_vector, diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index a1abf5fe..67cd0bf5 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -136,7 +136,7 @@ def set_init_dict(model_dict, checkpoint_state, c): return model_dict -def format_cond_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: """Format kwargs to hande auxilary inputs to models. Args: diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index 86fda642..69800cf7 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -16,8 +16,8 @@ tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.c python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
# change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ - --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DDC/run.sh b/recipes/ljspeech/tacotron2-DDC/run.sh index eaa05b60..dd36454f 100644 --- a/recipes/ljspeech/tacotron2-DDC/run.sh +++ b/recipes/ljspeech/tacotron2-DDC/run.sh @@ -16,7 +16,7 @@ rm LJSpeech-1.1.tar.bz2 python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ # training .... # change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 7c4f0adf..d2f62d49 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -57,7 +57,7 @@ def test_speedy_speech(): # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model.forward( - x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} + x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) o_de = outputs["model_outputs"] attn = outputs["alignments"] @@ -71,7 +71,7 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)}) + model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] o_dr = outputs["durations_log"] diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 66372470..fc3d9799 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -53,7 +53,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ 
-105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -158,7 +158,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -214,7 +214,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -269,7 +269,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 31682d7a..2abd968d 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -69,7 +69,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -194,7 +194,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -257,7 +257,7 @@ class TacotronGSTTrainTest(unittest.TestCase): 
optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) From 64f0f57757b7d0729d2049d3f28ded8ddfeb34e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 14:18:18 +0200 Subject: [PATCH 077/258] `TrainerAbstract` and related updates for `TrainerTTS` --- TTS/bin/train_tts.py | 2 +- TTS/trainer.py | 714 ++++------------------------------------- TTS/tts/trainer_tts.py | 709 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 771 insertions(+), 654 deletions(-) create mode 100644 TTS/tts/trainer_tts.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 3270d0e0..06765906 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -2,7 +2,7 @@ import os import sys import traceback -from TTS.trainer import TrainerTTS +from TTS.tts.trainer_tts import TrainerTTS from TTS.utils.arguments import init_training from TTS.utils.generic_utils import remove_experiment_folder diff --git a/TTS/trainer.py b/TTS/trainer.py index c1d1c340..5c02fdfb 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,39 +1,23 @@ # -*- coding: utf-8 -*- import importlib -import logging -import os -import time -from argparse import Namespace +from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, TypeVar import torch from coqpit import Coqpit # DISTRIBUTED from torch import nn -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets import TTSDataset, load_meta_data -from TTS.tts.layers import setup_loss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda -from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.training import check_update, setup_torch_training_env +_DataLoader = TypeVar("_DataLoader") @dataclass class TrainingArgs(Coqpit): + """Trainer arguments that are parsed externally (e.g. 
CLI)""" + continue_path: str = field( default="", metadata={ @@ -58,676 +42,100 @@ class TrainingArgs(Coqpit): # pylint: disable=import-outside-toplevel, too-many-public-methods -class TrainerTTS: - use_cuda, num_gpus = setup_torch_training_env(True, False) - def __init__( - self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger = None, - tb_logger: TensorboardLogger = None, - model: nn.Module = None, - output_path: str = None, - ) -> None: - self.args = args - self.config = config - self.c_logger = ConsoleLogger() if c_logger is None else c_logger - if tb_logger is None: - self.tb_logger = TensorboardLogger(output_path, model_name=config.model) - self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) - else: - self.tb_logger = tb_logger - self.output_path = output_path - self.total_steps_done = 0 - self.epochs_done = 0 - self.restore_step = 0 - self.best_loss = float("inf") - self.train_loader = None - self.eval_loader = None - self.output_audio_path = os.path.join(output_path, "test_audios") - - self.keep_avg_train = None - self.keep_avg_eval = None - - log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") - self._setup_logger_config(log_file) - - # model, audio processor, datasets, loss - # init audio processor - self.ap = AudioProcessor(**self.config.audio.to_dict()) - - # init character processor - self.model_characters = self.get_character_processor(self.config) - - # load dataset samples - self.data_train, self.data_eval = load_meta_data(self.config.datasets) - - # default speaker manager - self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) - - # init TTS model - if model is not None: - self.model = model - else: - self.model = self.get_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, - ) - - # setup criterion - self.criterion = self.get_criterion(self.config) - - # DISTRUBUTED - if self.num_gpus > 1: - init_distributed( - args.rank, - self.num_gpus, - args.group_id, - self.config.distributed_backend, - self.config.distributed_url, - ) - - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - - # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None - - # setup optimizer - self.optimizer = self.get_optimizer(self.model, self.config) - - if self.args.restore_path: - self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, self.scaler - ) - - # setup scheduler - self.scheduler = self.get_scheduler(self.config, self.optimizer) - - # DISTRUBUTED - if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) - - # count model size - num_params = count_parameters(self.model) - print("\n > Model has {} parameters".format(num_params)) +class TrainerAbstract(ABC): @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, d_vector_dim) - return model + def _is_apex_available(): + return importlib.util.find_spec("apex") is not None @staticmethod + @abstractmethod + def get_model(*args, **kwargs) -> nn.Module: + pass + + @staticmethod + @abstractmethod def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - optimizer_name = config.optimizer - optimizer_params = config.optimizer_params - if optimizer_name.lower() == "radam": - module = importlib.import_module("TTS.utils.radam") - optimizer = getattr(module, "RAdam") - else: - optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - - @staticmethod - def get_character_processor(config: Coqpit) -> str: - # setup custom characters if set in config file. 
- # TODO: implement CharacterProcessor - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - else: - from TTS.tts.utils.text.symbols import phonemes, symbols - model_characters = phonemes if config.use_phonemes else symbols - return model_characters - - @staticmethod - def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None - ) -> SpeakerManager: - speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) - return speaker_manager + pass @staticmethod + @abstractmethod def get_scheduler( config: Coqpit, optimizer: torch.optim.Optimizer ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - lr_scheduler = config.lr_scheduler - lr_scheduler_params = config.lr_scheduler_params - if lr_scheduler is None: - return None - if lr_scheduler.lower() == "noamlr": - from TTS.utils.training import NoamLR - - scheduler = NoamLR - else: - scheduler = getattr(torch.optim, lr_scheduler) - return scheduler(optimizer, **lr_scheduler_params) + pass @staticmethod + @abstractmethod def get_criterion(config: Coqpit) -> nn.Module: - return setup_loss(config) + pass - def restore_model( - self, - config: Coqpit, - restore_path: str, - model: nn.Module, - optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None, - ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: - print(" > Restoring from %s ..." % os.path.basename(restore_path)) - checkpoint = torch.load(restore_path) - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict + @abstractmethod + def restore_model(self, *args, **kwargs) -> Tuple: + pass - for group in optimizer.param_groups: - group["lr"] = self.config.lr - print( - " > Model restored from step %d" % checkpoint["step"], - ) - restore_step = checkpoint["step"] - return model, optimizer, scaler, restore_step + @abstractmethod + def get_train_dataloader(self, *args, **kwargs) -> _DataLoader: + pass - def _get_loader( - self, - r: int, - ap: AudioProcessor, - is_eval: bool, - data_items: List, - verbose: bool, - speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List], - ) -> DataLoader: - if is_eval and not self.config.run_eval: - loader = None - else: - dataset = TTSDataset( - outputs_per_step=r, - text_cleaner=self.config.text_cleaner, - compute_linear_spec=self.config.model.lower() == "tacotron", - meta_data=data_items, - ap=ap, - tp=self.config.characters, - add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, - min_seq_len=self.config.min_seq_len, - max_seq_len=self.config.max_seq_len, - phoneme_cache_path=self.config.phoneme_cache_path, - use_phonemes=self.config.use_phonemes, - phoneme_language=self.config.phoneme_language, - enable_eos_bos=self.config.enable_eos_bos_chars, - use_noise_augment=not is_eval, - verbose=verbose, - speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, - 
d_vector_mapping=d_vectors - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, - ) - - if self.config.use_phonemes and self.config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(self.config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, - pin_memory=False, - ) - return loader - - def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) - - def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) + @abstractmethod + def get_eval_dataloder(self, *args, **kwargs) -> _DataLoader: + pass + @abstractmethod def format_batch(self, batch: List) -> Dict: - # setup input batch - text_input = batch[0] - text_lengths = batch[1] - speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None - mel_input = batch[4] - mel_lengths = batch[5] - stop_targets = batch[6] - item_idx = batch[7] - d_vectors = batch[8] - speaker_ids = batch[9] - attn_mask = batch[10] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - # compute durations from attention masks - durations = None - if attn_mask is not None: - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - - # set stop targets view, we predict a single stop token per iteration. 
- stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch batch to GPU - if self.use_cuda: - text_input = to_cuda(text_input) - text_lengths = to_cuda(text_lengths) - mel_input = to_cuda(mel_input) - mel_lengths = to_cuda(mel_lengths) - linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None - stop_targets = to_cuda(stop_targets) - attn_mask = to_cuda(attn_mask) if attn_mask is not None else None - durations = to_cuda(durations) if attn_mask is not None else None - if speaker_ids is not None: - speaker_ids = to_cuda(speaker_ids) - if d_vectors is not None: - d_vectors = to_cuda(d_vectors) - - return { - "text_input": text_input, - "text_lengths": text_lengths, - "speaker_names": speaker_names, - "mel_input": mel_input, - "mel_lengths": mel_lengths, - "linear_input": linear_input, - "stop_targets": stop_targets, - "attn_mask": attn_mask, - "durations": durations, - "speaker_ids": speaker_ids, - "d_vectors": d_vectors, - "max_text_length": max_text_length, - "max_spec_length": max_spec_length, - "item_idx": item_idx, - } + pass + @abstractmethod def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.train_step(batch, criterion) - return self.model.train_step(batch, criterion) + pass + @abstractmethod def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - self.on_train_step_start() - step_start_time = time.time() - - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - - # zero-out optimizer - self.optimizer.zero_grad() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._train_step(batch, self.criterion) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") - - # optimizer step - if self.config.mixed_precision: - # model optimizer step in mixed precision mode - self.scaler.scale(loss_dict["loss"]).backward() - self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.scaler.step(self.optimizer) - self.scaler.update() - else: - # main model optimizer step - loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.optimizer.step() - - step_time = time.time() - step_start_time - - # setup lr - if self.config.lr_scheduler: - self.scheduler.step() - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - self.keep_avg_train.update_values(update_train_values) - - # print training progress - current_lr = self.optimizer.param_groups[0]["lr"] - if self.total_steps_done % self.config.print_step == 0: - log_dict = { - "max_spec_length": [batch["max_spec_length"], 1], # value, precision - "max_text_length": [batch["max_text_length"], 1], - "step_time": [step_time, 4], - 
"loader_time": [loader_time, 2], - "current_lr": current_lr, - } - self.c_logger.print_train_step( - batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values - ) - - if self.args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if self.total_steps_done % self.config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - } - iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) - - if self.total_steps_done % self.config.save_step == 0: - if self.config.checkpoint: - # save model - save_checkpoint( - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - model_loss=loss_dict["loss"], - characters=self.model_characters, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - # training visualizations - if hasattr(self.model, "module"): - figures, audios = self.model.module.train_log(self.ap, batch, outputs) - else: - figures, audios = self.model.train_log(self.ap, batch, outputs) - self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) - self.total_steps_done += 1 - self.on_train_step_end() - return outputs, loss_dict + pass + @abstractmethod def train_epoch(self) -> None: - self.model.train() - epoch_start_time = time.time() - if self.use_cuda: - batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) - self.c_logger.print_train_start() - loader_start_time = time.time() - for cur_step, batch in enumerate(self.train_loader): - _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) - epoch_time = time.time() - epoch_start_time - # Plot self.epochs_done Stats - if self.args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) - if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + pass + @abstractmethod def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.eval_step(batch, self.criterion) - return self.model.eval_step(batch, self.criterion) + pass + @abstractmethod def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - with torch.no_grad(): - step_start_time = time.time() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._eval_step(batch) - - step_time = time.time() - step_start_time - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_step_time"] = step_time - self.keep_avg_eval.update_values(update_eval_values) - - if self.config.print_eval: - self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) - return outputs, loss_dict + pass + @abstractmethod def eval_epoch(self) -> None: - self.model.eval() - self.c_logger.print_eval_start() - 
loader_start_time = time.time() - batch = None - for cur_step, batch in enumerate(self.eval_loader): - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) - outputs, _ = self.eval_step(batch, cur_step) - # Plot epoch stats and samples from the last batch. - if self.args.rank == 0: - if hasattr(self.model, "module"): - figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) - else: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) - self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) + pass - def test_run( - self, - ) -> None: - print(" | > Synthesizing test sentences.") - test_audios = {} - test_figures = {} - test_sentences = self.config.test_sentences - aux_inputs = self._get_aux_inputs() - for idx, sen in enumerate(test_sentences): - wav, alignment, model_outputs, _ = synthesis( - self.model, - sen, - self.config, - self.use_cuda, - self.ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - self.ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) - self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - - def _get_aux_inputs(self) -> Dict: - # setup speaker_id - speaker_id = 0 if self.config.use_speaker_embedding else None - # setup d_vector - d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) - if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding - else None - ) - # setup style_mel - if self.config.has("gst_style_input"): - style_wav = self.config.gst_style_input - else: - style_wav = None - if style_wav is None and "use_gst" in self.config and self.config.use_gst: - # inicialize GST with zero dict. 
- style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(self.config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return aux_inputs + @abstractmethod + def test_run(self) -> None: + pass + @abstractmethod def fit(self) -> None: - if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {self.best_loss}.") - - # define data loaders - self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - self.eval_loader = ( - self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - if self.config.run_eval - else None - ) - - self.total_steps_done = self.restore_step - - for epoch in range(0, self.config.epochs): - self.on_epoch_start() - self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage() if self.config.run_eval else None - self.epochs_done = epoch - self.c_logger.print_epoch_start(epoch, self.config.epochs) - self.train_epoch() - if self.config.run_eval: - self.eval_epoch() - if epoch >= self.config.test_delay_epochs and self.args.rank < 0: - self.test_run() - self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values - ) - self.save_best_model() - self.on_epoch_end() + pass + @abstractmethod def save_best_model(self) -> None: - self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], - self.best_loss, - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - self.model_characters, - keep_all_best=self.config.keep_all_best, - keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) + pass - @staticmethod - def _setup_logger_config(log_file: str) -> None: - logging.basicConfig( - level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] - ) + @abstractmethod + def on_epoch_start(self) -> None: + pass - def on_epoch_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_start"): - self.model.on_epoch_start(self) + @abstractmethod + def on_epoch_end(self) -> None: + pass - if hasattr(self.criterion, "on_epoch_start"): - self.criterion.on_epoch_start(self) + @abstractmethod + def on_train_step_start(self) -> None: + pass - if hasattr(self.optimizer, "on_epoch_start"): - self.optimizer.on_epoch_start(self) - - def on_epoch_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_end"): - self.model.on_epoch_end(self) - - if hasattr(self.criterion, "on_epoch_end"): - self.criterion.on_epoch_end(self) - - if hasattr(self.optimizer, "on_epoch_end"): - self.optimizer.on_epoch_end(self) - - def on_train_step_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_start"): - self.model.on_train_step_start(self) - - if hasattr(self.criterion, 
"on_train_step_start"): - self.criterion.on_train_step_start(self) - - if hasattr(self.optimizer, "on_train_step_start"): - self.optimizer.on_train_step_start(self) - - def on_train_step_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_end"): - self.model.on_train_step_end(self) - - if hasattr(self.criterion, "on_train_step_end"): - self.criterion.on_train_step_end(self) - - if hasattr(self.optimizer, "on_train_step_end"): - self.optimizer.on_train_step_end(self) + @abstractmethod + def on_train_step_end(self) -> None: + pass diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py new file mode 100644 index 00000000..9d060498 --- /dev/null +++ b/TTS/tts/trainer_tts.py @@ -0,0 +1,709 @@ +# -*- coding: utf-8 -*- + +import importlib +import logging +import os +import time +from argparse import Namespace +from typing import Dict, List, Tuple, Union + +import torch +from coqpit import Coqpit + +# DISTRIBUTED +from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.trainer import TrainerAbstract +from TTS.tts.datasets import TTSDataset, load_meta_data +from TTS.tts.layers import setup_loss +from TTS.tts.models import setup_model +from TTS.tts.utils.io import save_best_model, save_checkpoint +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.symbols import make_symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda +from TTS.utils.logging import ConsoleLogger, TensorboardLogger +from TTS.utils.training import check_update, setup_torch_training_env + + +# pylint: disable=import-outside-toplevel, too-many-public-methods + +class TrainerTTS(TrainerAbstract): + use_cuda, num_gpus = setup_torch_training_env(True, False) + + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, + model: nn.Module = None, + output_path: str = None, + ) -> None: + self.args = args + self.config = config + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + else: + self.tb_logger = tb_logger + self.output_path = output_path + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, "test_audios") + + self.keep_avg_train = None + self.keep_avg_eval = None + + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + + # model, audio processor, datasets, loss + # init audio processor + self.ap = AudioProcessor(**self.config.audio.to_dict()) + + # init character processor + self.model_characters = self.get_character_processor(self.config) + + # load dataset samples + self.data_train, self.data_eval = load_meta_data(self.config.datasets) + + # default speaker manager + self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.get_model( + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, + ) + + # setup criterion + self.criterion = self.get_criterion(self.config) + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed_backend, + self.config.distributed_url, + ) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + + # scalers for mixed precision training + self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None + + # setup optimizer + self.optimizer = self.get_optimizer(self.model, self.config) + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) + + # setup scheduler + self.scheduler = self.get_scheduler(self.config, self.optimizer) + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) + + # count model size + num_params = count_parameters(self.model) + print("\n > Model has {} parameters".format(num_params)) + + @staticmethod + def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, d_vector_dim) + return model + + @staticmethod + def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: + optimizer_name = config.optimizer + optimizer_params = config.optimizer_params + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), lr=config.lr, **optimizer_params) + + @staticmethod + def get_character_processor(config: Coqpit) -> str: + # setup custom characters if set in config file. 
+ # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters.to_dict()) + else: + from TTS.tts.utils.text.symbols import phonemes, symbols + model_characters = phonemes if config.use_phonemes else symbols + return model_characters + + @staticmethod + def get_speaker_manager( + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None + ) -> SpeakerManager: + speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) + return speaker_manager + + @staticmethod + def get_scheduler( + config: Coqpit, optimizer: torch.optim.Optimizer + ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + from TTS.utils.training import NoamLR + + scheduler = NoamLR + else: + scheduler = getattr(torch.optim, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + @staticmethod + def get_criterion(config: Coqpit) -> nn.Module: + return setup_loss(config) + + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None, + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scaler" in checkpoint and config.mixed_precision: + print(" > Restoring AMP Scaler...") + scaler.load_state_dict(checkpoint["scaler"]) + except (KeyError, RuntimeError): + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + for group in optimizer.param_groups: + group["lr"] = self.config.lr + print( + " > Model restored from step %d" % checkpoint["step"], + ) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + def _get_loader( + self, + r: int, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + speaker_ids: Union[Dict, List], + d_vectors: Union[Dict, List], + ) -> DataLoader: + if is_eval and not self.config.run_eval: + loader = None + else: + dataset = TTSDataset( + outputs_per_step=r, + text_cleaner=self.config.text_cleaner, + compute_linear_spec=self.config.model.lower() == "tacotron", + meta_data=data_items, + ap=ap, + tp=self.config.characters, + add_blank=self.config["add_blank"], + batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, + min_seq_len=self.config.min_seq_len, + max_seq_len=self.config.max_seq_len, + phoneme_cache_path=self.config.phoneme_cache_path, + use_phonemes=self.config.use_phonemes, + phoneme_language=self.config.phoneme_language, + enable_eos_bos=self.config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, + d_vector_mapping=d_vectors + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, + ) + + if self.config.use_phonemes and self.config.compute_input_seq_cache: + # precompute 
phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(self.config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def get_train_dataloader( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict + ) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) + + def get_eval_dataloder( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict + ) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) + + def format_batch(self, batch: List) -> Dict: + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # compute durations from attention masks + durations = None + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indices. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, : text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration.
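+ # Illustrative example (numbers made up): with r = 2 and 100-frame spectrograms, stop_targets of shape [B, 100] + # is viewed as [B, 50, 2] below, and a group counts as "stop" if any of its r frames is flagged, giving one + # float stop target per decoder iteration, i.e. shape [B, 50].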
+ stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + + # dispatch batch to GPU + if self.use_cuda: + text_input = to_cuda(text_input) + text_lengths = to_cuda(text_lengths) + mel_input = to_cuda(mel_input) + mel_lengths = to_cuda(mel_lengths) + linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None + stop_targets = to_cuda(stop_targets) + attn_mask = to_cuda(attn_mask) if attn_mask is not None else None + durations = to_cuda(durations) if attn_mask is not None else None + if speaker_ids is not None: + speaker_ids = to_cuda(speaker_ids) + if d_vectors is not None: + d_vectors = to_cuda(d_vectors) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "speaker_names": speaker_names, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "d_vectors": d_vectors, + "max_text_length": max_text_length, + "max_spec_length": max_spec_length, + "item_idx": item_idx, + } + + def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.train_step(batch, criterion) + return self.model.train_step(batch, criterion) + + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: + self.on_train_step_start() + step_start_time = time.time() + + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # zero-out optimizer + self.optimizer.zero_grad() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self._train_step(batch, self.criterion) + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") + + # optimizer step + if self.config.mixed_precision: + # model optimizer step in mixed precision mode + self.scaler.scale(loss_dict["loss"]).backward() + self.scaler.unscale_(self.optimizer) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + # main model optimizer step + loss_dict["loss"].backward() + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) + self.optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if self.config.lr_scheduler: + self.scheduler.step() + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_train_values = dict() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + self.keep_avg_train.update_values(update_train_values) + + # print training progress + current_lr = self.optimizer.param_groups[0]["lr"] + if self.total_steps_done % self.config.print_step == 0: + log_dict = { + "max_spec_length": [batch["max_spec_length"], 1], # value, precision + "max_text_length": [batch["max_text_length"], 1], + "step_time": [step_time, 4], + "loader_time": [loader_time, 2], + "current_lr": 
current_lr, + } + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = { + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + } + iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) + + if self.total_steps_done % self.config.save_step == 0: + if self.config.checkpoint: + # save model + save_checkpoint( + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + model_loss=loss_dict["loss"], + characters=self.model_characters, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, + ) + # training visualizations + if hasattr(self.model, "module"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + else: + figures, audios = self.model.train_log(self.ap, batch, outputs) + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) + self.total_steps_done += 1 + self.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self) -> None: + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.train_loader): + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + + def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.eval_step(batch, self.criterion) + return self.model.eval_step(batch, self.criterion) + + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + with torch.no_grad(): + step_start_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self._eval_step(batch) + + step_time = time.time() - step_start_time + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_step_time"] = step_time + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self) -> None: + self.model.eval() + self.c_logger.print_eval_start() + loader_start_time = time.time() + batch = None + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = 
self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # Plot epoch stats and samples from the last batch. + if self.args.rank == 0: + if hasattr(self.model, "module"): + figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) + else: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) + + def test_run( + self, + ) -> None: + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_aux_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + os.makedirs(file_path, exist_ok=True) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + self.ap.save_wav(wav, file_path) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) + + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) + + def _get_aux_inputs(self) -> Dict: + # setup speaker_id + speaker_id = 0 if self.config.use_speaker_embedding else None + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) + if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding + else None + ) + # setup style_mel + if self.config.has("gst_style_input"): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and "use_gst" in self.config and self.config.use_gst: + # initialize GST with zero dict.
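+ # Illustration (the token count is only an example): with gst_num_style_tokens = 10 the loop below builds + # style_wav = {"0": 0, "1": 0, ..., "9": 0}, i.e. a zero weight for every style token, so the test sentences + # are synthesized with a neutral style.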
+ style_wav = {} + print("WARNING: You did not provide a gst style wav, so a zero style tensor is used instead!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs + + def fit(self) -> None: + if self.restore_step != 0 or self.args.best_path: + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] + print(f" > Starting with loaded last best loss {self.best_loss}.") + + # define data loaders + self.train_loader = self.get_train_dataloader( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, + ) + self.eval_loader = ( + self.get_eval_dataloder( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, + ) + if self.config.run_eval + else None + ) + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs and self.args.rank <= 0: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) + self.save_best_model() + self.on_epoch_end() + + def save_best_model(self) -> None: + self.best_loss = save_best_model( + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.best_loss, + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + self.model_characters, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, + ) + + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) + + def on_epoch_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_start"): + self.model.on_epoch_start(self) + + if hasattr(self.criterion, "on_epoch_start"): + self.criterion.on_epoch_start(self) + + if hasattr(self.optimizer, "on_epoch_start"): + self.optimizer.on_epoch_start(self) + + def on_epoch_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_end"): + self.model.on_epoch_end(self) + + if hasattr(self.criterion, "on_epoch_end"): + self.criterion.on_epoch_end(self) + + if hasattr(self.optimizer, "on_epoch_end"): + self.optimizer.on_epoch_end(self) + + def on_train_step_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_start"): + self.model.on_train_step_start(self) + + if hasattr(self.criterion, "on_train_step_start"): + self.criterion.on_train_step_start(self) + + if hasattr(self.optimizer, "on_train_step_start"): + self.optimizer.on_train_step_start(self) + + def on_train_step_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_end"): + 
self.model.on_train_step_end(self) + + if hasattr(self.criterion, "on_train_step_end"): + self.criterion.on_train_step_end(self) + + if hasattr(self.optimizer, "on_train_step_end"): + self.optimizer.on_train_step_end(self) From b934665fc0ed5a1c079110a1d7b114f09b73a3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 17:34:19 +0200 Subject: [PATCH 078/258] fix calculation of `loader_start_time` --- TTS/tts/trainer_tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py index 9d060498..6c900120 100644 --- a/TTS/tts/trainer_tts.py +++ b/TTS/tts/trainer_tts.py @@ -476,8 +476,8 @@ class TrainerTTS(TrainerAbstract): else: batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) self.c_logger.print_train_start() - loader_start_time = time.time() for cur_step, batch in enumerate(self.train_loader): + loader_start_time = time.time() _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) epoch_time = time.time() - epoch_start_time # Plot self.epochs_done Stats From 20c4a8c8e1b83c44753887477ac30cc194e27d97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 7 Jun 2021 19:22:44 +0200 Subject: [PATCH 079/258] `tts` model abstraction with `TTSModel` --- TTS/tts/models/abstract_tts.py | 134 ++++++++++++++++++++++++++++++++ TTS/tts/models/align_tts.py | 3 +- TTS/tts/models/glow_tts.py | 3 +- TTS/tts/models/speedy_speech.py | 3 +- 4 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 TTS/tts/models/abstract_tts.py diff --git a/TTS/tts/models/abstract_tts.py b/TTS/tts/models/abstract_tts.py new file mode 100644 index 00000000..9132f7eb --- /dev/null +++ b/TTS/tts/models/abstract_tts.py @@ -0,0 +1,134 @@ +from coqpit import Coqpit +from abc import ABC, abstractmethod +from typing import Dict, Tuple + +import numpy as np +import torch +from torch import nn + +from TTS.utils.audio import AudioProcessor + +# pylint: skip-file + + +class TTSModel(nn.Module, ABC): + """Abstract TTS class. Every new `tts` model must inherit this. + + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + @abstractmethod + def forward(self, text: torch.Tensor, aux_input={}, **kwargs) -> Dict: + """Forward pass for the model mainly used in training. + + You can be flexible here and use a different number of arguments and argument names, since it is mostly used by + `train_step()` in training without being exposed outside of the class. + + Args: + text (torch.Tensor): Input text character sequence ids. + aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sort of input + for the model. + + Returns: + Dict: model outputs. This must include an item keyed `model_outputs` as the final artifact of the model. + """ + outputs_dict = {"model_outputs": None} + ... + return outputs_dict + + @abstractmethod + def inference(self, text: torch.Tensor, aux_input={}) -> Dict: + """Forward pass for inference. + + After the model is trained, this is the only function that connects the model to the outside world. + + This function must only take a `text` input and a dictionary that has all the other model specific inputs. + We don't use `**kwargs` since it is problematic with the TorchScript API. 
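+ + A rough usage sketch (the exact `aux_input` keys depend on the model): + `outputs = model.inference(text, aux_input={"d_vectors": d_vector})`, where `outputs["model_outputs"]` is + typically the synthesized spectrogram.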
+ + Args: + text (torch.Tensor): Input text character sequence ids. + aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. + + Returns: + Dict: Model outputs. This must include an item keyed `model_outputs`. + """ + outputs_dict = {"model_outputs": None} + ... + return outputs_dict + + @abstractmethod + def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + """Perform a single training step. Run the model forward pass and compute losses. + + Args: + batch (Dict): Input tensors. + criterion (nn.Module): Loss layer designed for the model. + + Returns: + Tuple[Dict, Dict]: Model outputs and computed losses. + """ + outputs_dict = {} + loss_dict = {} # this returns from the criterion + ... + return outputs_dict, loss_dict + + @abstractmethod + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """Create visualizations and waveform examples for training. + + For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to + be projected onto Tensorboard. + + Args: + ap (AudioProcessor): audio processor used at training. + batch (Dict): Model inputs used at the previous training step. + outputs (Dict): Model outputs generated at the previous training step. + + Returns: + Tuple[Dict, np.ndarray]: training plots and output waveform. + """ + figures_dict = {} + output_wav = np.array() + ... + return figures_dict, output_wav + + @abstractmethod + def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + """Perform a single evaluation step. Run the model forward pass and compute losses. In most cases, you can + call `train_step()` with no changes. + + Args: + batch (Dict): Input tensors. + criterion (nn.Module): Loss layer designed for the model. + + Returns: + Tuple[Dict, Dict]: Model outputs and computed losses. + """ + outputs_dict = {} + loss_dict = {} # this returns from the criterion + ... + return outputs_dict, loss_dict + + @abstractmethod + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """The same as `train_log()`""" + figures_dict = {} + output_wav = np.array() + ... + return figures_dict, output_wav + + @abstractmethod + def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None: + """Load a checkpoint and get ready for training or inference. + + Args: + config (Coqpit): Model configuration. + checkpoint_path (str): Path to the model checkpoint file. + eval (bool, optional): If true, init model for inference else for training. Defaults to False. + """ + ... diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 6c268a43..75fb50de 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -7,13 +7,14 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path +from TTS.tts.models.abstract_tts import TTSModel from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class AlignTTS(nn.Module): +class AlignTTS(TTSModel): """AlignTTS with modified duration predictor. 
https://arxiv.org/pdf/2003.01950.pdf diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index e61b80c2..a30eadb4 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -7,13 +7,14 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path +from TTS.tts.models.abstract_tts import TTSModel from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class GlowTTS(nn.Module): +class GlowTTS(TTSModel): """Glow TTS models from https://arxiv.org/abs/2005.11129 Args: diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index d4a90a2e..44a47722 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -6,13 +6,14 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path +from TTS.tts.models.abstract_tts import TTSModel from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class SpeedySpeech(nn.Module): +class SpeedySpeech(TTSModel): """Speedy Speech model https://arxiv.org/abs/2008.03802 From 6b907554f8f10fbd26bf4af7c3bd69fbcd184c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:22:05 +0200 Subject: [PATCH 080/258] Implement unified trainer --- TTS/bin/train_encoder.py | 2 +- TTS/bin/train_tts.py | 24 +- TTS/bin/train_vocoder.py | 27 + TTS/bin/train_vocoder_gan.py | 638 ------------------ TTS/bin/train_vocoder_wavegrad.py | 431 ------------ TTS/bin/train_vocoder_wavernn.py | 431 ------------ TTS/trainer.py | 999 ++++++++++++++++++++++++++-- TTS/tts/models/tacotron_abstract.py | 245 ------- TTS/tts/trainer_tts.py | 709 -------------------- TTS/utils/arguments.py | 182 ----- TTS/utils/callbacks.py | 75 +++ TTS/utils/distribute.py | 45 -- TTS/utils/trainer_utils.py | 65 ++ TTS/utils/training.py | 79 +-- 14 files changed, 1128 insertions(+), 2824 deletions(-) create mode 100644 TTS/bin/train_vocoder.py delete mode 100755 TTS/bin/train_vocoder_gan.py delete mode 100644 TTS/bin/train_vocoder_wavegrad.py delete mode 100644 TTS/bin/train_vocoder_wavernn.py delete mode 100644 TTS/tts/models/tacotron_abstract.py delete mode 100644 TTS/tts/trainer_tts.py delete mode 100644 TTS/utils/arguments.py create mode 100644 TTS/utils/callbacks.py create mode 100644 TTS/utils/trainer_utils.py diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 6e4a9b32..38902a18 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,8 +13,8 @@ from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings +from TTS.trainer import init_training from TTS.tts.datasets import load_meta_data -from TTS.utils.arguments import init_training from TTS.utils.audio import 
AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict from TTS.utils.radam import RAdam diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 06765906..c491700d 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,27 +1,13 @@ -import os import sys -import traceback -from TTS.tts.trainer_tts import TrainerTTS -from TTS.utils.arguments import init_training -from TTS.utils.generic_utils import remove_experiment_folder +from TTS.trainer import Trainer, init_training def main(): - try: - args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=output_path) - trainer.fit() - except KeyboardInterrupt: - remove_experiment_folder(output_path) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(output_path) - traceback.print_exc() - sys.exit(1) + """Run 🐸TTS trainer from terminal. This is also necessary to run DDP training by ```distribute.py```""" + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=False) + trainer.fit() if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py new file mode 100644 index 00000000..868aae2e --- /dev/null +++ b/TTS/bin/train_vocoder.py @@ -0,0 +1,27 @@ +import os +import sys +import traceback + +from TTS.trainer import Trainer, init_training +from TTS.utils.generic_utils import remove_experiment_folder + + +def main(): + try: + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger) + trainer.fit() + except KeyboardInterrupt: + remove_experiment_folder(output_path) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(output_path) + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py deleted file mode 100755 index ea317ef6..00000000 --- a/TTS/bin/train_vocoder_gan.py +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env python3 -# TODO: mixed precision training -"""Trains GAN based vocoder model.""" - -import itertools -import os -import sys -import time -import traceback -from inspect import signature - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.gan_dataset import GANDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import plot_results, setup_discriminator, setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, 
is_val=False, verbose=False): - loader = None - if not is_val or c.run_eval: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - return_pairs=c.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in c else False, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) - dataset.shuffle_mapping() - sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=num_gpus == 0, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - if isinstance(data[0], list): - x_G, y_G = data[0] - x_D, y_D = data[1] - if use_cuda: - x_G = x_G.cuda(non_blocking=True) - y_G = y_G.cuda(non_blocking=True) - x_D = x_D.cuda(non_blocking=True) - y_D = y_D.cuda(non_blocking=True) - return x_G, y_G, x_D, y_D - x, y = data - if use_cuda: - x = x.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) - return x, y, None, None - - -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model_G.train() - model_D.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, c_D, y_D = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G) - y_hat_sub = None - y_G_sub = None - y_hat_vis = y_hat # for visualization - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_hat_vis = y_hat - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - # we don't need scores for real samples for training G since they are always 1 - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - - # compute losses - loss_G_dict = criterion_G( - y_hat=y_hat, - y=y_G, - scores_fake=scores_fake, - feats_fake=feats_fake, - feats_real=feats_real, - y_hat_sub=y_hat_sub, - y_sub=y_G_sub, - ) - loss_G = loss_G_dict["G_loss"] - - # optimizer generator - optimizer_G.zero_grad() - loss_G.backward() - if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) - optimizer_G.step() - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - if c.diff_samples_for_G_and_D: - # use a different sample than generator - with torch.no_grad(): - y_hat = model_G(c_D) - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - else: - # use the same samples as generator - c_D = c_G.clone() - y_D = y_G.clone() - - # run D with or without cond. features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach().clone(), c_D) - D_out_real = model_D(y_D, c_D) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_D) - - # format D outputs - if isinstance(D_out_fake, tuple): - # model_D returns scores and features - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - # model D returns only scores - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] - - # optimizer discriminator - optimizer_D.zero_grad() - loss_D.backward() - if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) - optimizer_D.step() - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr_G": current_lr_G, 
"lr_D": current_lr_D, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) - - # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]) - end_time = time.time() - - if scheduler_G is not None: - scheduler_G.step() - - if scheduler_D is not None: - scheduler_D.step() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - torch.cuda.empty_cache() - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model_G.eval() - model_D.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, _, _ = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - y_hat_sub = None - y_G_sub = None - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - feats_fake, feats_real = None, None - - # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub) - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - with torch.no_grad(): - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach(), c_G) - D_out_real = model_D(y_G, c_G) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - predict_waveform = y_hat[0].squeeze(0).detach().cpu().numpy() - real_waveform = y_G[0].squeeze(0).cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": predict_waveform, "eval/real_waveformo": real_waveform}, c.audio["sample_rate"] - ) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - - # synthesize a full voice - data_loader.return_segments = False - torch.cuda.empty_cache() - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model_gen = setup_generator(c) - model_disc = setup_discriminator(c) - - # setup criterion - criterion_gen = GeneratorLoss(c) - criterion_disc = DiscriminatorLoss(c) - - if use_cuda: - model_gen.cuda() - criterion_gen.cuda() - model_disc.cuda() - criterion_disc.cuda() - - # setup optimizers - # TODO: allow loading custom optimizers - optimizer_gen = None - optimizer_disc = None - optimizer_gen = getattr(torch.optim, c.optimizer) - optimizer_gen = optimizer_gen(model_gen.parameters(), lr=c.lr_gen, **c.optimizer_params) - optimizer_disc = getattr(torch.optim, c.optimizer) - - if c.discriminator_model == "hifigan_discriminator": - optimizer_disc = optimizer_disc( - itertools.chain(model_disc.msd.parameters(), model_disc.mpd.parameters()), - lr=c.lr_disc, - **c.optimizer_params, - ) - else: - optimizer_disc = optimizer_disc(model_disc.parameters(), lr=c.lr_disc, **c.optimizer_params) - - # schedulers - scheduler_gen = None - scheduler_disc = None - if "lr_scheduler_gen" in c: - scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = 
scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: - scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) - print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) - print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint["model_disc"]) - print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - # restore schedulers if it is a continuing training. - if args.continue_path != "": - if "scheduler" in checkpoint and scheduler_gen is not None: - print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint and scheduler_disc is not None: - print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) - scheduler_disc.optimizer = optimizer_disc - if c.lr_scheduler_disc == "ExponentialLR": - scheduler_disc.last_epoch = checkpoint["epoch"] - except RuntimeError: - # restore only matching layers. - print(" > Partial model initialization...") - model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_gen.load_state_dict(model_dict) - - model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) - model_disc.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- if args.continue_path == "": - for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen - - for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc - - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model_gen = DDP_th(model_gen, device_ids=[args.rank]) - model_disc = DDP_th(model_disc, device_ids=[args.rank]) - - num_params = count_parameters(model_gen) - print(" > Generator has {} parameters".format(num_params), flush=True) - num_params = count_parameters(model_disc) - print(" > Discriminator has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with best loss of {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py deleted file mode 100644 index c8f067ee..00000000 --- a/TTS/bin/train_vocoder_wavegrad.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Trains WaveGrad vocoder models.""" - -import os -import sys -import time -import traceback - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.optim import Adam -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset -from TTS.vocoder.utils.generic_utils import plot_results, setup_generator -from TTS.vocoder.utils.io import save_best_model, 
save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveGradDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose, - ) - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.batch_size, - shuffle=num_gpus <= 1, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - - return loader - - -def format_data(data): - # return a whole audio segment - m, x = data - x = x.unsqueeze(1) - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def format_test_data(data): - # return a whole audio segment - m, x = data - m = m[None, ...] - x = x[None, None, ...] - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # setup noise schedule - noise_schedule = c["train_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - else: - model.compute_noise_level(betas) - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - with torch.cuda.amp.autocast(enabled=c.mixed_precision): - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - # check nan loss - if torch.isnan(loss).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - optimizer.zero_grad() - - # backward pass with loss scaling - if c.mixed_precision: - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - grad_norm = torch.nn.utils.grad_clip_norm_(model.parameters(), c.clip_grad) - optimizer.step() - - # schedule update - if scheduler is not None: - scheduler.step() - - # disconnect loss values - loss_dict = dict() - for key, value in loss_wavegrad_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - # epoch/step timing - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr = list(optimizer.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): 
- update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": current_lr, - "grad_norm": grad_norm.item(), - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm.item(), "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - if c.tb_model_param_stats and args.rank == 0: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - loss_dict = dict() - for key, value in loss_wavegrad_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - data_loader.dataset.return_segments = False - samples = data_loader.dataset.load_test_samples(1) - m, x = format_test_data(samples[0]) - - # setup noise schedule and inference - noise_schedule = c["test_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - # compute voice - x_pred = model.module.inference(m) - else: - model.compute_noise_level(betas) - # compute voice - x_pred = model.inference(m) - - # 
compute spectrograms - figures = plot_results(x_pred, x, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - sample_voice = x_pred[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - data_loader.dataset.return_segments = True - - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model = setup_generator(c) - - # scaler for mixed_precision - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # setup optimizers - optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0) - - # schedulers - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - - # setup criterion - criterion = torch.nn.L1Loss().cuda() - - if use_cuda: - model.cuda() - criterion.cuda() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler.optimizer = optimizer - if "scaler" in checkpoint and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - num_params = count_parameters(model) - print(" > WaveGrad has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py deleted file mode 100644 index 86a1506a..00000000 --- a/TTS/bin/train_vocoder_wavernn.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Train WaveRNN vocoder model.""" - -import os -import random -import sys -import time -import traceback - -import torch -from torch.utils.data import DataLoader - -from TTS.tts.utils.visual import plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -# from torch.utils.data.distributed import DistributedSampler - - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad=c.padding, - mode=c.mode, - mulaw=c.mulaw, - is_training=not is_val, - verbose=verbose, - 
) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=c.batch_size, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=True, - ) - return loader - - -def format_data(data): - # setup input data - x_input = data[0] - mels = data[1] - y_coarse = data[2] - - # dispatch data to GPU - if use_cuda: - x_input = x_input.cuda(non_blocking=True) - mels = mels.cuda(non_blocking=True) - y_coarse = y_coarse.cuda(non_blocking=True) - - return x_input, mels, y_coarse - - -def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # train loop - for num_iter, data in enumerate(data_loader): - start_time = time.time() - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - optimizer.zero_grad() - - if c.mixed_precision: - # mixed precision training - with torch.cuda.amp.autocast(): - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - # full precision training - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - if loss.item() is None: - raise RuntimeError(" [!] None loss. 
Exiting ...") - loss.backward() - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - optimizer.step() - - if scheduler is not None: - scheduler.step() - - # get the current learning rate - cur_lr = list(optimizer.param_groups)[0]["lr"] - - step_time = time.time() - start_time - epoch_time += step_time - - update_train_values = dict() - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) - - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": cur_lr, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - # synthesize a full voice - rand_idx = random.randrange(0, len(train_data)) - wav_path = ( - train_data[rand_idx] if not isinstance(train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] - ) - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - tb_logger.tb_train_audios(global_step, {"train/audio": sample_wav}, c.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - with torch.no_grad(): - for num_iter, data in enumerate(data_loader): - start_time = time.time() - # format data - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - loss = criterion(y_hat, y_coarse) - # Compute avg loss - # if num_gpus > 1: - # loss = 
reduce_tensor(loss.data, num_gpus) - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if epoch % c.test_every_epochs == 0 and epoch != 0: - # synthesize a full voice - rand_idx = random.randrange(0, len(eval_data)) - wav_path = eval_data[rand_idx] if not isinstance(eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # Sample audio - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_wav}, c.audio["sample_rate"]) - - # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_eval_figures(global_step, figures) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - # setup model - model_wavernn = setup_generator(c) - - # setup amp scaler - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # define train functions - if c.mode == "mold": - criterion = discretized_mix_logistic_loss - elif c.mode == "gauss": - criterion = gaussian_loss - elif isinstance(c.mode, int): - criterion = torch.nn.CrossEntropyLoss() - - if use_cuda: - model_wavernn.cuda() - if isinstance(c.mode, int): - criterion.cuda() - - optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) - - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) - # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) - - # restore any checkpoint - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model_wavernn.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring Generator LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - scheduler.optimizer = optimizer - if "scaler" in checkpoint 
and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_wavernn.load_state_dict(model_dict) - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRIBUTED - # if num_gpus > 1: - # model = apply_gradient_allreduce(model) - - num_parameters = count_parameters(model_wavernn) - print(" > Model has {} parameters".format(num_parameters), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_wavernn, optimizer, criterion, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict["avg_model_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model_wavernn, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/trainer.py b/TTS/trainer.py index 5c02fdfb..8b7be3d1 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,22 +1,52 @@ # -*- coding: utf-8 -*- +import glob import importlib -from abc import ABC, abstractmethod +import logging +import os +import re +import sys +import time +import traceback +from argparse import Namespace from dataclasses import dataclass, field -from typing import Dict, List, Tuple, TypeVar +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit - -# DISTRIBUTED from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader -_DataLoader = TypeVar("_DataLoader") +from TTS.config import load_config +from TTS.tts.datasets import load_meta_data +from TTS.tts.models import setup_model as setup_tts_model +from TTS.tts.utils.text.symbols import parse_symbols +from TTS.utils.audio import AudioProcessor +from TTS.utils.callbacks import TrainerCallback +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, + to_cuda, +) +from 
TTS.utils.io import copy_model_files, save_best_model, save_checkpoint +from TTS.utils.logging import ConsoleLogger, TensorboardLogger +from TTS.utils.trainer_utils import * +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.models import setup_model as setup_vocoder_model + +if is_apex_available(): + from apex import amp @dataclass class TrainingArgs(Coqpit): - """Trainer arguments that are parsed externally (e.g. CLI)""" + """Trainer arguments""" continue_path: str = field( default="", @@ -41,101 +71,926 @@ class TrainingArgs(Coqpit): group_id: str = field(default="", metadata={"help": "Process group id in distributed training."}) -# pylint: disable=import-outside-toplevel, too-many-public-methods +class Trainer: + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + output_path: str, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, + model: nn.Module = None, + cudnn_benchmark: bool = False, + ) -> None: + """Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models + or easily be customized. + Notes: -class TrainerAbstract(ABC): + Supports Automatic Mixed Precision training. If `Apex` is availabe, it automatically picks that, else + it uses PyTorch's native `amp` module. `Apex` may provide more stable training in some cases. + + Args: + + args (Union[Coqpit, Namespace]): Training arguments parsed either from console by `argparse` or `TrainingArgs` + config object. + + config (Coqpit): Model config object. It includes all the values necessary for initializing, training, evaluating + and testing the model. + + output_path (str): Path to the output training folder. All the files are saved under thi path. + + c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default + console logger is used. Defaults to None. + + tb_logger (TensorboardLogger, optional): Tensorboard logger. If not provided, the default logger is used. + Defaults to None. + + model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer` + initializes a model from the provided config. Defaults to None. + + cudnn_benchmark (bool): enable/disable PyTorch cudnn benchmarking. It is better to disable if the model input + length is changing batch to batch along the training. + + Examples: + + Running trainer on a model. + + >>> args = TrainingArgs(...) + >>> config = HifiganConfig(...) + >>> model = GANModel(config) + >>> trainer = Trainer(args, config, output_path, model=model) + >>> trainer.fit() + + Running trainer on a config. + + >>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,) + >>> args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) + >>> trainer = Trainer(args, config, output_path, c_logger, tb_logger) + >>> trainer.fit() + + TODO: + - Accumulate gradients b/w batches. + - Deepspeed integration + - Profiler integration. + - Overfitting to a batch. 
+ - TPU training + """ + + # set and initialize Pytorch runtime + self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark) + + if config is None: + # parse config from console arguments + config, output_path, _, c_logger, tb_logger = process_args(args) + + self.output_path = output_path + self.args = args + self.config = config + + # init loggers + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) + else: + self.tb_logger = tb_logger + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, "test_audios") + + self.keep_avg_train = None + self.keep_avg_eval = None + + self.use_apex = self._is_apex_available() + self.use_amp_scaler = self.config.mixed_precision and self.use_cuda + + # init audio processor + self.ap = AudioProcessor(**self.config.audio.to_dict()) + + # load dataset samples + # TODO: refactor this + if "datasets" in self.config: + # load data for `tts` models + self.data_train, self.data_eval = load_meta_data(self.config.datasets) + elif self.config.feature_path is not None: + # load data for `vocoder`models + print(f" > Loading features from: {self.config.feature_path}") + self.data_eval, self.data_train = load_wav_feat_data( + self.config.data_path, self.config.feature_path, self.config.eval_split_size + ) + else: + # load data for `vocoder`models + self.data_eval, self.data_train = load_wav_data(self.config.data_path, self.config.eval_split_size) + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.get_model(self.config) + + # setup criterion + self.criterion = self.get_criterion(self.model) + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed_backend, + self.config.distributed_url, + ) + + if self.use_cuda: + self.model.cuda() + if isinstance(self.criterion, list): + self.criterion = [x.cuda() for x in self.criterion] + else: + self.criterion.cuda() + + # setup optimizer + self.optimizer = self.get_optimizer(self.model, self.config) + + # callback + self.callbacks = TrainerCallback(self) + self.callbacks.on_init_start() + + # init AMP + if self.use_amp_scaler: + if self.use_apex: + self.scaler = None + self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1") + if isinstance(self.optimizer, list): + self.scaler = [torch.cuda.amp.GradScaler()] * len(self.optimizer) + else: + self.scaler = torch.cuda.amp.GradScaler() + else: + self.scaler = None + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) + + # setup scheduler + self.scheduler = self.get_scheduler(self.model, self.config, self.optimizer) + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) + + # count model size + num_params = count_parameters(self.model) + print("\n > Model has {} parameters".format(num_params)) + + self.callbacks.on_init_end() + + @staticmethod + def get_model(config: Coqpit) -> nn.Module: + """Initialize model from config. + + Args: + config (Coqpit): Model config. + + Returns: + nn.Module: initialized model. + """ + # TODO: better model setup + try: + model = setup_tts_model(config) + except ModuleNotFoundError: + model = setup_vocoder_model(config) + return model + + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None, + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + """Restore training from an old run. 
It restores model, optimizer, AMP scaler and training stats. + + Args: + config (Coqpit): Model config. + restore_path (str): Path to the restored training run. + model (nn.Module): Model to restored. + optimizer (torch.optim.Optimizer): Optimizer to restore. + scaler (torch.cuda.amp.GradScaler, optional): AMP scaler to restore. Defaults to None. + + Returns: + Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: [description] + """ + + def _restore_list_objs(states, obj): + if isinstance(obj, list): + for idx, state in enumerate(states): + obj[idx].load_state_dict(state) + else: + obj.load_state_dict(states) + return obj + + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer = _restore_list_objs(checkpoint["optimizer"], optimizer) + if "scaler" in checkpoint and self.use_amp_scaler: + print(" > Restoring AMP Scaler...") + scaler = _restore_list_objs(checkpoint["scaler"], scaler) + except (KeyError, RuntimeError): + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + if isinstance(self.optimizer, list): + for idx, optim in enumerate(optimizer): + for group in optim.param_groups: + group["lr"] = self.get_lr(model, config)[idx] + else: + for group in optimizer.param_groups: + group["lr"] = self.get_lr(model, config) + print( + " > Model restored from step %d" % checkpoint["step"], + ) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + @staticmethod + def _get_loader( + model: nn.Module, + config: Coqpit, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + num_gpus: int, + ) -> DataLoader: + if hasattr(model, "get_data_loader"): + loader = model.get_data_loader(config, ap, is_eval, data_items, verbose, num_gpus) + return loader + + def get_train_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + """Initialize and return a training data loader. + + Args: + ap (AudioProcessor): Audio processor. + data_items (List): Data samples used for training. + verbose (bool): enable/disable printing loader stats at initialization. + + Returns: + DataLoader: Initialized training data loader. + """ + return self._get_loader(self.model, self.config, ap, False, data_items, verbose, self.num_gpus) + + def get_eval_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + return self._get_loader(self.model, self.config, ap, True, data_items, verbose, self.num_gpus) + + def format_batch(self, batch: List) -> Dict: + """Format dataloader ouput and return a batch. + + Args: + batch (List): Batch returned by the dataloader. + + Returns: + Dict: Formatted batch. + """ + batch = self.model.format_batch(batch) + if self.use_cuda: + for k, v in batch.items(): + batch[k] = to_cuda(v) + return batch + + @staticmethod + def _model_train_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a trainig forward step. Compute model outputs and losses. + + Args: + batch (Dict): [description] + model (nn.Module): [description] + criterion (nn.Module): [description] + optimizer_idx (int, optional): [description]. Defaults to None. 
+ + Returns: + Tuple[Dict, Dict]: [description] + """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + # unwrap model in DDP training + if hasattr(model, "module"): + return model.module.train_step(*input_args) + return model.train_step(*input_args) + + def _optimize( + self, + batch: Dict, + model: nn.Module, + optimizer: Union[torch.optim.Optimizer, List], + scaler: "AMPScaler", + criterion: nn.Module, + scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List], # pylint: disable=protected-access + config: Coqpit, + optimizer_idx: int = None, + ) -> Tuple[Dict, Dict, int, torch.Tensor]: + """Perform a forward - backward pass and run the optimizer. + + Args: + batch (Dict): Input batch. If + model (nn.Module): Model for training. Defaults to None. + optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use. + scaler (AMPScaler): AMP scaler. + criterion (nn.Module): Model's criterion. + scheduler (Union[torch.optim.lr_scheduler._LRScheduler, List]): LR scheduler used by the optimizer. + config (Coqpit): Model config. + optimizer_idx (int, optional): Target optimizer being used. Defaults to None. + + Raises: + RuntimeError: When the loss is NaN. + + Returns: + Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm. + """ + step_start_time = time.time() + # zero-out optimizer + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=config.mixed_precision): + if optimizer_idx is not None: + outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx) + else: + outputs, loss_dict = self._model_train_step(batch, model, criterion) + + # skip the rest + if outputs is None: + step_time = time.time() - step_start_time + return None, {}, step_time, 0 + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") + + # set gradient clipping threshold + if "grad_clip" in config and config.grad_clip is not None: + if optimizer_idx is not None: + grad_clip = config.grad_clip[optimizer_idx] + else: + grad_clip = config.grad_clip + else: + grad_clip = 0.0 # meaning no gradient clipping + + # TODO: compute grad norm + if grad_clip <= 0: + grad_norm = 0 + + # optimizer step + update_lr_scheduler = True + if self.use_amp_scaler: + if self.use_apex: + with amp.scale_loss(loss_dict["loss"], self.optimizer) as scaled_loss: + scaled_loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + self.config.grad_clip, + ) + else: + # model optimizer step in mixed precision mode + scaler.scale(loss_dict["loss"]).backward() + scaler.unscale_(optimizer) + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + scale_prev = scaler.get_scale() + scaler.step(optimizer) + scaler.update() + update_lr_scheduler = scale_prev <= scaler.get_scale() + else: + # main model optimizer step + loss_dict["loss"].backward() + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if scheduler is not None and update_lr_scheduler: + scheduler.step() + + # detach losses + loss_dict = self._detach_loss_dict(loss_dict) + if optimizer_idx is not None: + loss_dict[f"loss_{optimizer_idx}"] = loss_dict.pop("loss") + 
loss_dict[f"grad_norm_{optimizer_idx}"] = grad_norm + return outputs, loss_dict, step_time, grad_norm + + @staticmethod + def _detach_loss_dict(loss_dict: Dict) -> Dict: + """Detach loss values from autograp. + + Args: + loss_dict (Dict): losses. + + Returns: + Dict: losses detached from autograph. + """ + loss_dict_detached = {} + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_detached[key] = value + else: + loss_dict_detached[key] = value.item() + return loss_dict_detached + + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: + """Perform a training step on a batch of inputs and log the process. + + Args: + batch (Dict): Input batch. + batch_n_steps (int): Number of steps needed to complete an epoch. Needed for logging. + step (int): Current step number in this epoch. + loader_start_time (float): The time when the data loading is started. Needed for logging. + + Returns: + Tuple[Dict, Dict]: Model outputs and losses. + """ + self.callbacks.on_train_step_start() + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # conteainers to hold model outputs and losses for each optimizer. + outputs_per_optimizer = None + log_dict = {} + loss_dict = {} + if not isinstance(self.optimizer, list): + # training with a single optimizer + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, self.optimizer, self.scaler, self.criterion, self.scheduler, self.config + ) + loss_dict.update(loss_dict_new) + else: + # training with multiple optimizers (e.g. GAN) + outputs_per_optimizer = [None] * len(self.optimizer) + total_step_time = 0 + for idx, optimizer in enumerate(self.optimizer): + criterion = self.criterion + scaler = self.scaler[idx] if self.use_amp_scaler else None + scheduler = self.scheduler[idx] + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, optimizer, scaler, criterion, scheduler, self.config, idx + ) + # skip the rest if the model returns None + total_step_time += step_time + outputs_per_optimizer[idx] = outputs + # if None, model skipped this optimizer + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + keep_avg_update = dict() + for key, value in log_dict.items(): + keep_avg_update["avg_" + key] = value + keep_avg_update["avg_loader_time"] = loader_time + keep_avg_update["avg_step_time"] = step_time + self.keep_avg_train.update_values(keep_avg_update) + + # print training progress + if self.total_steps_done % self.config.print_step == 0: + # log learning rates + lrs = {} + if isinstance(self.optimizer, list): + for idx, optimizer in enumerate(self.optimizer): + current_lr = self.optimizer[idx].param_groups[0]["lr"] + lrs.update({f"current_lr_{idx}": current_lr}) + else: + current_lr = self.optimizer.param_groups[0]["lr"] + lrs = {"current_lr": current_lr} + log_dict.update(lrs) + if grad_norm > 0: + log_dict.update({"grad_norm": grad_norm}) + # log run-time stats + log_dict.update( + { + "step_time": round(step_time, 4), + "loader_time": round(loader_time, 4), + } + ) + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load and don't log every step + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = log_dict + 
iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) + if self.total_steps_done % self.config.save_step == 0 and self.total_steps_done != 0: + if self.config.checkpoint: + # checkpoint the model + model_loss = ( + loss_dict[self.config.target_loss] if "target_loss" in self.config else loss_dict["loss"] + ) + save_checkpoint( + self.config, + self.model, + self.optimizer, + self.scaler if self.use_amp_scaler else None, + self.total_steps_done, + self.epochs_done, + self.output_path, + model_loss=model_loss, + ) + # training visualizations + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + elif hasattr(self.model, "train_log"): + figures, audios = self.model.train_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_train_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.total_steps_done += 1 + self.callbacks.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self) -> None: + """Main entry point for training. Run training on the whole training samples.""" + self.train_loader = self.get_train_dataloader( + self.ap, + self.data_train, + verbose=True, + ) + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + for cur_step, batch in enumerate(self.train_loader): + loader_start_time = time.time() + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + + @staticmethod + def _model_eval_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a evaluation forward pass. Compute model outputs and losses with no gradients. + + Args: + batch (Dict): IBatch of inputs. + model (nn.Module): Model to call evaluation. + criterion (nn.Module): Model criterion. + optimizer_idx (int, optional): Optimizer ID to define the closure in multi-optimizer training. Defaults to None. + + Returns: + Tuple[Dict, Dict]: model outputs and losses. 
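# Illustrative sketch (not from the patch; the helper name is hypothetical): the
# train/eval step helpers and the logging calls above repeatedly unwrap
# DistributedDataParallel via `hasattr(model, "module")`. Factored out, the pattern
# is simply:
from torch import nn

def unwrap_ddp(model: nn.Module) -> nn.Module:
    """Return the wrapped module when running under DDP, otherwise the model itself."""
    return model.module if hasattr(model, "module") else model

# usage: unwrap_ddp(self.model).eval_step(batch, criterion)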
+ """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + if hasattr(model, "module"): + return model.module.eval_step(*input_args) + return model.eval_step(*input_args) + + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + with torch.no_grad(): + outputs_per_optimizer = None + loss_dict = {} + if not isinstance(self.optimizer, list): + outputs, loss_dict = self._model_eval_step(batch, self.model, self.criterion) + else: + outputs_per_optimizer = [None] * len(self.optimizer) + for idx, _ in enumerate(self.optimizer): + criterion = self.criterion + outputs, loss_dict_new = self._model_eval_step(batch, self.model, criterion, idx) + outputs_per_optimizer[idx] = outputs + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self) -> None: + self.eval_loader = ( + self.get_eval_dataloader( + self.ap, + self.data_eval, + verbose=True, + ) + if self.config.run_eval + else None + ) + + self.model.eval() + self.c_logger.print_eval_start() + loader_start_time = time.time() + batch = None + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # plot epoch stats, artifacts and figures + if self.args.rank == 0: + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "eval_log"): + figures, audios = self.model.module.eval_log(self.ap, batch, outputs) + elif hasattr(self.model, "eval_log"): + figures, audios = self.model.eval_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) + + def test_run(self) -> None: + """Run test and log the results. Test run must be defined by the model. 
+ Model must return figures and audios to be logged by the Tensorboard logger.""" + if hasattr(self.model, "test_run"): + if hasattr(self.eval_loader.load_test_samples): + samples = self.eval_loader.load_test_samples(1) + figures, audios = self.model.test_run(samples) + else: + figures, audios = self.model.test_run() + self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, figures) + + def _fit(self) -> None: + """🏃 train -> evaluate -> test for the number of epochs.""" + if self.restore_step != 0 or self.args.best_path: + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] + print(f" > Starting with loaded last best loss {self.best_loss}.") + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.callbacks.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs and self.args.rank < 0: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) + self.save_best_model() + self.callbacks.on_epoch_end() + + def fit(self) -> None: + """Where the ✨️magic✨️ happens...""" + try: + self._fit() + except KeyboardInterrupt: + self.callbacks.on_keyboard_interrupt() + # if the output folder is empty remove the run. + remove_experiment_folder(self.output_path) + # stop without error signal + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except BaseException: # pylint: disable=broad-except + remove_experiment_folder(self.output_path) + traceback.print_exc() + sys.exit(1) + + def save_best_model(self) -> None: + """Save the best model. 
It only saves if the current target loss is smaller then the previous.""" + self.best_loss = save_best_model( + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.best_loss, + self.config, + self.model, + self.optimizer, + self.scaler if self.use_amp_scaler else None, + self.total_steps_done, + self.epochs_done, + self.output_path, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + ) + + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) @staticmethod def _is_apex_available(): return importlib.util.find_spec("apex") is not None @staticmethod - @abstractmethod - def get_model(*args, **kwargs) -> nn.Module: - pass + def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]: + if hasattr(model, "get_optimizer"): + optimizer = model.get_optimizer() + if optimizer is None: + optimizer_name = config.optimizer + optimizer_params = config.optimizer_params + return get_optimizer(optimizer_name, optimizer_params, config.lr, model) + return optimizer @staticmethod - @abstractmethod - def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - pass + def get_lr(model: nn.Module, config: Coqpit) -> Union[float, List[float]]: + lr = None + if hasattr(model, "get_lr"): + lr = model.get_lr() + if lr is None: + lr = config.lr + return lr @staticmethod - @abstractmethod def get_scheduler( - config: Coqpit, optimizer: torch.optim.Optimizer - ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - pass + model: nn.Module, config: Coqpit, optimizer: Union[torch.optim.Optimizer, List] + ) -> Union[torch.optim.lr_scheduler._LRScheduler, List]: # pylint: disable=protected-access + scheduler = None + if hasattr(model, "get_scheduler"): + scheduler = model.get_scheduler(optimizer) + if scheduler is None: + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + return get_scheduler(lr_scheduler, lr_scheduler_params, optimizer) + return scheduler @staticmethod - @abstractmethod - def get_criterion(config: Coqpit) -> nn.Module: - pass + def get_criterion(model: nn.Module) -> nn.Module: + criterion = None + criterion = model.get_criterion() + return criterion - @abstractmethod - def restore_model(self, *args, **kwargs) -> Tuple: - pass - @abstractmethod - def get_train_dataloader(self, *args, **kwargs) -> _DataLoader: - pass +def init_arguments(): + train_config = TrainingArgs() + parser = train_config.init_argparse(arg_prefix="") + return parser - @abstractmethod - def get_eval_dataloder(self, *args, **kwargs) -> _DataLoader: - pass - @abstractmethod - def format_batch(self, batch: List) -> Dict: - pass +def get_last_checkpoint(path): + """Get latest checkpoint or/and best model in path. - @abstractmethod - def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - pass + It is based on globbing for `*.pth.tar` and the RegEx + `(checkpoint|best_model)_([0-9]+)`. - @abstractmethod - def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - pass + Args: + path (list): Path to files to be compared. - @abstractmethod - def train_epoch(self) -> None: - pass + Raises: + ValueError: If no checkpoint or best_model files are found. 
- @abstractmethod - def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - pass + Returns: + last_checkpoint (str): Last checkpoint filename. + """ + file_names = glob.glob(os.path.join(path, "*.pth.tar")) + last_models = {} + last_model_nums = {} + for key in ["checkpoint", "best_model"]: + last_model_num = None + last_model = None + # pass all the checkpoint files and find + # the one with the largest model number suffix. + for file_name in file_names: + match = re.search(f"{key}_([0-9]+)", file_name) + if match is not None: + model_num = int(match.groups()[0]) + if last_model_num is None or model_num > last_model_num: + last_model_num = model_num + last_model = file_name - @abstractmethod - def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - pass + # if there is not checkpoint found above + # find the checkpoint with the latest + # modification date. + key_file_names = [fn for fn in file_names if key in fn] + if last_model is None and len(key_file_names) > 0: + last_model = max(key_file_names, key=os.path.getctime) + last_model_num = torch.load(last_model)["step"] - @abstractmethod - def eval_epoch(self) -> None: - pass + if last_model is not None: + last_models[key] = last_model + last_model_nums[key] = last_model_num - @abstractmethod - def test_run(self) -> None: - pass + # check what models were found + if not last_models: + raise ValueError(f"No models found in continue path {path}!") + if "checkpoint" not in last_models: # no checkpoint just best model + last_models["checkpoint"] = last_models["best_model"] + elif "best_model" not in last_models: # no best model + # this shouldn't happen, but let's handle it just in case + last_models["best_model"] = None + # finally check if last best model is more recent than checkpoint + elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: + last_models["checkpoint"] = last_models["best_model"] - @abstractmethod - def fit(self) -> None: - pass + return last_models["checkpoint"], last_models["best_model"] - @abstractmethod - def save_best_model(self) -> None: - pass - @abstractmethod - def on_epoch_start(self) -> None: - pass +def process_args(args, config=None): + """Process parsed comand line arguments. - @abstractmethod - def on_epoch_end(self) -> None: - pass + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. - @abstractmethod - def on_train_step_start(self) -> None: - pass + Returns: + c (TTS.utils.io.AttrDict): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does + the TensorBoard loggind. 
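# Illustrative sketch (not from the patch; the file names are made up):
# get_last_checkpoint() above ranks `*.pth.tar` files by the step number captured
# with the RegEx `(checkpoint|best_model)_([0-9]+)`. A stand-alone demonstration of
# that match:
import re

pattern = re.compile(r"(checkpoint|best_model)_([0-9]+)")
for file_name in ["checkpoint_10000.pth.tar", "best_model_8000.pth.tar", "config.json"]:
    match = pattern.search(file_name)
    if match is not None:
        kind, step = match.group(1), int(match.group(2))
        print(kind, step)  # -> checkpoint 10000 / best_model 8000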
+ """ + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # setup output paths and read configs + if config is None: + config = load_config(args.config_path) + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + if config.mixed_precision: + print(" > Mixed precision mode is ON") + experiment_path = args.continue_path + if not experiment_path: + experiment_path = create_experiment_folder(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + # setup rank 0 process in distributed training + tb_logger = None + if args.rank == 0: + os.makedirs(audio_path, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. + if config.has("characters_config"): + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + os.chmod(audio_path, 0o775) + os.chmod(experiment_path, 0o775) + tb_logger = TensorboardLogger(experiment_path, model_name=config.model) + # write model desc to tensorboard + tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, tb_logger - @abstractmethod - def on_train_step_end(self) -> None: - pass + +def init_training(argv: Union[List, Coqpit], config: Coqpit = None): + """Initialization of a training run.""" + if isinstance(argv, Coqpit): + parser = argv.init_argparse(arg_prefix="") + else: + parser = init_arguments() + args = parser.parse_known_args() + config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, config) + return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py deleted file mode 100644 index 705ea5bc..00000000 --- a/TTS/tts/models/tacotron_abstract.py +++ /dev/null @@ -1,245 +0,0 @@ -import copy -from abc import ABC, abstractmethod -from typing import Dict - -import torch -from torch import nn - -from TTS.tts.utils.data import sequence_mask -from TTS.utils.generic_utils import format_aux_input -from TTS.utils.training import gradual_training_scheduler - - -class TacotronAbstract(ABC, nn.Module): - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - d_vector_dim=None, - use_gst=False, - gst=None, - gradual_training=None, - ): - """Abstract Tacotron class""" - super().__init__() - self.num_chars = num_chars - self.r = r - self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim - self.use_gst = use_gst - self.gst = gst - self.num_speakers = num_speakers - self.bidirectional_decoder = bidirectional_decoder - self.double_decoder_consistency = double_decoder_consistency - self.ddc_r = ddc_r - self.attn_type = attn_type - self.attn_win = attn_win - self.attn_norm = attn_norm - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.prenet_dropout_at_inference = prenet_dropout_at_inference - self.forward_attn = forward_attn - self.trans_agent = trans_agent - self.forward_attn_mask = forward_attn_mask - self.location_attn = location_attn - self.attn_K = attn_K - self.separate_stopnet = separate_stopnet - self.encoder_in_features = encoder_in_features - self.decoder_in_features = decoder_in_features - self.d_vector_dim = d_vector_dim - self.gradual_training = gradual_training - - # layers - self.embedding = None - self.encoder = None - self.decoder = None - self.postnet = None - - # multispeaker - if self.d_vector_dim is None: - # if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim - self.use_d_vectors = False - else: - # if d_vector_dim is not None we need use speaker embedding per sample - self.use_d_vectors = True - - # global style token - if self.gst and use_gst: - self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim - self.gst_layer = None - - # model states - self.embedded_speakers = None - self.embedded_speakers_projected = None - - # additional layers - self.decoder_backward = None - self.coarse_decoder = None - - @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: - return format_aux_input({"d_vectors": None, 
"speaker_ids": None}, aux_input) - - ############################# - # INIT FUNCTIONS - ############################# - - def _init_states(self): - self.embedded_speakers = None - self.embedded_speakers_projected = None - - def _init_backward_decoder(self): - self.decoder_backward = copy.deepcopy(self.decoder) - - def _init_coarse_decoder(self): - self.coarse_decoder = copy.deepcopy(self.decoder) - self.coarse_decoder.r_init = self.ddc_r - self.coarse_decoder.set_r(self.ddc_r) - - ############################# - # CORE FUNCTIONS - ############################# - - @abstractmethod - def forward(self): - pass - - @abstractmethod - def inference(self): - pass - - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - self.decoder.set_r(state["r"]) - if eval: - self.eval() - assert not self.training - - ############################# - # COMMON COMPUTE FUNCTIONS - ############################# - - def compute_masks(self, text_lengths, mel_lengths): - """Compute masks against sequence paddings.""" - # B x T_in_max (boolean) - input_mask = sequence_mask(text_lengths) - output_mask = None - if mel_lengths is not None: - max_len = mel_lengths.max() - r = self.decoder.r - max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len) - return input_mask, output_mask - - def _backward_pass(self, mel_specs, encoder_outputs, mask): - """Run backwards decoder""" - decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask - ) - decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() - return decoder_outputs_b, alignments_b - - def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask): - """Double Decoder Consistency""" - T = mel_specs.shape[1] - if T % self.coarse_decoder.r > 0: - padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) - mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0)) - decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( - encoder_outputs.detach(), mel_specs, input_mask - ) - # scale_factor = self.decoder.r_init / self.decoder.r - alignments_backward = torch.nn.functional.interpolate( - alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" - ).transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward[:, :T, :] - return decoder_outputs_backward, alignments_backward - - ############################# - # EMBEDDING FUNCTIONS - ############################# - - def compute_speaker_embedding(self, speaker_ids): - """Compute speaker embedding vectors""" - if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) - if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) - - def compute_gst(self, inputs, style_input, speaker_embedding=None): - """Compute global style token""" - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) - if speaker_embedding is not None: - query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) - - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) - else: - gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable - inputs = self._concat_speaker_embedding(inputs, gst_outputs) - return inputs - - @staticmethod - def _add_speaker_embedding(outputs, embedded_speakers): - embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + embedded_speakers_ - return outputs - - @staticmethod - def _concat_speaker_embedding(outputs, embedded_speakers): - embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, embedded_speakers_], dim=-1) - return outputs - - ############################# - # CALLBACKS - ############################# - - def on_epoch_start(self, trainer): - """Callback for setting values wrt gradual training schedule. - - Args: - trainer (TrainerTTS): TTS trainer object that is used to train this model. 
- """ - if self.gradual_training: - r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) - trainer.config.r = r - self.decoder.set_r(r) - if trainer.config.bidirectional_decoder: - trainer.model.decoder_backward.set_r(r) - trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) - trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) - print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py deleted file mode 100644 index 6c900120..00000000 --- a/TTS/tts/trainer_tts.py +++ /dev/null @@ -1,709 +0,0 @@ -# -*- coding: utf-8 -*- - -import importlib -import logging -import os -import time -from argparse import Namespace -from typing import Dict, List, Tuple, Union - -import torch -from coqpit import Coqpit - -# DISTRIBUTED -from torch import nn -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.trainer import TrainerAbstract -from TTS.tts.datasets import TTSDataset, load_meta_data -from TTS.tts.layers import setup_loss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda -from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.training import check_update, setup_torch_training_env - - -# pylint: disable=import-outside-toplevel, too-many-public-methods - -class TrainerTTS(TrainerAbstract): - use_cuda, num_gpus = setup_torch_training_env(True, False) - - def __init__( - self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger = None, - tb_logger: TensorboardLogger = None, - model: nn.Module = None, - output_path: str = None, - ) -> None: - self.args = args - self.config = config - self.c_logger = ConsoleLogger() if c_logger is None else c_logger - if tb_logger is None: - self.tb_logger = TensorboardLogger(output_path, model_name=config.model) - self.tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) - else: - self.tb_logger = tb_logger - self.output_path = output_path - - self.total_steps_done = 0 - self.epochs_done = 0 - self.restore_step = 0 - self.best_loss = float("inf") - self.train_loader = None - self.eval_loader = None - self.output_audio_path = os.path.join(output_path, "test_audios") - - self.keep_avg_train = None - self.keep_avg_eval = None - - log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") - self._setup_logger_config(log_file) - - # model, audio processor, datasets, loss - # init audio processor - self.ap = AudioProcessor(**self.config.audio.to_dict()) - - # init character processor - self.model_characters = self.get_character_processor(self.config) - - # load dataset samples - self.data_train, self.data_eval = load_meta_data(self.config.datasets) - - # default speaker manager - self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) - - # init TTS model - if model is not None: - self.model = model - else: - self.model = self.get_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, - ) - - # setup criterion - self.criterion = self.get_criterion(self.config) - - # DISTRUBUTED - if self.num_gpus > 1: - init_distributed( - args.rank, - self.num_gpus, - args.group_id, - self.config.distributed_backend, - self.config.distributed_url, - ) - - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - - # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None - - # setup optimizer - self.optimizer = self.get_optimizer(self.model, self.config) - - if self.args.restore_path: - self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, self.scaler - ) - - # setup scheduler - self.scheduler = self.get_scheduler(self.config, self.optimizer) - - # DISTRUBUTED - if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) - - # count model size - num_params = count_parameters(self.model) - print("\n > Model has {} parameters".format(num_params)) - - @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, d_vector_dim) - return model - - @staticmethod - def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - optimizer_name = config.optimizer - optimizer_params = config.optimizer_params - if optimizer_name.lower() == "radam": - module = importlib.import_module("TTS.utils.radam") - optimizer = getattr(module, "RAdam") - else: - optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - - @staticmethod - def get_character_processor(config: Coqpit) -> str: - # setup custom characters if set in config file. 
- # TODO: implement CharacterProcessor - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - else: - from TTS.tts.utils.text.symbols import phonemes, symbols - model_characters = phonemes if config.use_phonemes else symbols - return model_characters - - @staticmethod - def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None - ) -> SpeakerManager: - speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) - return speaker_manager - - @staticmethod - def get_scheduler( - config: Coqpit, optimizer: torch.optim.Optimizer - ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - lr_scheduler = config.lr_scheduler - lr_scheduler_params = config.lr_scheduler_params - if lr_scheduler is None: - return None - if lr_scheduler.lower() == "noamlr": - from TTS.utils.training import NoamLR - - scheduler = NoamLR - else: - scheduler = getattr(torch.optim, lr_scheduler) - return scheduler(optimizer, **lr_scheduler_params) - - @staticmethod - def get_criterion(config: Coqpit) -> nn.Module: - return setup_loss(config) - - def restore_model( - self, - config: Coqpit, - restore_path: str, - model: nn.Module, - optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None, - ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: - print(" > Restoring from %s ..." % os.path.basename(restore_path)) - checkpoint = torch.load(restore_path) - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["lr"] = self.config.lr - print( - " > Model restored from step %d" % checkpoint["step"], - ) - restore_step = checkpoint["step"] - return model, optimizer, scaler, restore_step - - def _get_loader( - self, - r: int, - ap: AudioProcessor, - is_eval: bool, - data_items: List, - verbose: bool, - speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List], - ) -> DataLoader: - if is_eval and not self.config.run_eval: - loader = None - else: - dataset = TTSDataset( - outputs_per_step=r, - text_cleaner=self.config.text_cleaner, - compute_linear_spec=self.config.model.lower() == "tacotron", - meta_data=data_items, - ap=ap, - tp=self.config.characters, - add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, - min_seq_len=self.config.min_seq_len, - max_seq_len=self.config.max_seq_len, - phoneme_cache_path=self.config.phoneme_cache_path, - use_phonemes=self.config.use_phonemes, - phoneme_language=self.config.phoneme_language, - enable_eos_bos=self.config.enable_eos_bos_chars, - use_noise_augment=not is_eval, - verbose=verbose, - speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, - d_vector_mapping=d_vectors - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, - ) - - if self.config.use_phonemes and self.config.compute_input_seq_cache: - # precompute 
phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(self.config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, - pin_memory=False, - ) - return loader - - def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) - - def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) - - def format_batch(self, batch: List) -> Dict: - # setup input batch - text_input = batch[0] - text_lengths = batch[1] - speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None - mel_input = batch[4] - mel_lengths = batch[5] - stop_targets = batch[6] - item_idx = batch[7] - d_vectors = batch[8] - speaker_ids = batch[9] - attn_mask = batch[10] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - # compute durations from attention masks - durations = None - if attn_mask is not None: - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - - # set stop targets view, we predict a single stop token per iteration. 
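A standalone sketch of the stop-target grouping performed just below, using hypothetical values (2 utterances, 6 frames, reduction factor r = 2); the tensor contents are illustrative only:

import torch

r = 2
# per-frame stop flags; 1 marks frames at or past the end of the utterance
stop_targets = torch.tensor([[0., 0., 0., 0., 1., 1.],
                             [0., 0., 0., 1., 1., 1.]])
# group r consecutive frames so there is one stop decision per decoder iteration
grouped = stop_targets.view(stop_targets.shape[0], stop_targets.size(1) // r, -1)
stop_per_step = (grouped.sum(2) > 0.0).float()
print(stop_per_step)  # tensor([[0., 0., 1.], [0., 1., 1.]])
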
- stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch batch to GPU - if self.use_cuda: - text_input = to_cuda(text_input) - text_lengths = to_cuda(text_lengths) - mel_input = to_cuda(mel_input) - mel_lengths = to_cuda(mel_lengths) - linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None - stop_targets = to_cuda(stop_targets) - attn_mask = to_cuda(attn_mask) if attn_mask is not None else None - durations = to_cuda(durations) if attn_mask is not None else None - if speaker_ids is not None: - speaker_ids = to_cuda(speaker_ids) - if d_vectors is not None: - d_vectors = to_cuda(d_vectors) - - return { - "text_input": text_input, - "text_lengths": text_lengths, - "speaker_names": speaker_names, - "mel_input": mel_input, - "mel_lengths": mel_lengths, - "linear_input": linear_input, - "stop_targets": stop_targets, - "attn_mask": attn_mask, - "durations": durations, - "speaker_ids": speaker_ids, - "d_vectors": d_vectors, - "max_text_length": max_text_length, - "max_spec_length": max_spec_length, - "item_idx": item_idx, - } - - def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.train_step(batch, criterion) - return self.model.train_step(batch, criterion) - - def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - self.on_train_step_start() - step_start_time = time.time() - - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - - # zero-out optimizer - self.optimizer.zero_grad() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._train_step(batch, self.criterion) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") - - # optimizer step - if self.config.mixed_precision: - # model optimizer step in mixed precision mode - self.scaler.scale(loss_dict["loss"]).backward() - self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.scaler.step(self.optimizer) - self.scaler.update() - else: - # main model optimizer step - loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.optimizer.step() - - step_time = time.time() - step_start_time - - # setup lr - if self.config.lr_scheduler: - self.scheduler.step() - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - self.keep_avg_train.update_values(update_train_values) - - # print training progress - current_lr = self.optimizer.param_groups[0]["lr"] - if self.total_steps_done % self.config.print_step == 0: - log_dict = { - "max_spec_length": [batch["max_spec_length"], 1], # value, precision - "max_text_length": [batch["max_text_length"], 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": 
current_lr, - } - self.c_logger.print_train_step( - batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values - ) - - if self.args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if self.total_steps_done % self.config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - } - iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) - - if self.total_steps_done % self.config.save_step == 0: - if self.config.checkpoint: - # save model - save_checkpoint( - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - model_loss=loss_dict["loss"], - characters=self.model_characters, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - # training visualizations - if hasattr(self.model, "module"): - figures, audios = self.model.module.train_log(self.ap, batch, outputs) - else: - figures, audios = self.model.train_log(self.ap, batch, outputs) - self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) - self.total_steps_done += 1 - self.on_train_step_end() - return outputs, loss_dict - - def train_epoch(self) -> None: - self.model.train() - epoch_start_time = time.time() - if self.use_cuda: - batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) - self.c_logger.print_train_start() - for cur_step, batch in enumerate(self.train_loader): - loader_start_time = time.time() - _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) - epoch_time = time.time() - epoch_start_time - # Plot self.epochs_done Stats - if self.args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) - if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, self.total_steps_done) - - def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.eval_step(batch, self.criterion) - return self.model.eval_step(batch, self.criterion) - - def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - with torch.no_grad(): - step_start_time = time.time() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._eval_step(batch) - - step_time = time.time() - step_start_time - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_step_time"] = step_time - self.keep_avg_eval.update_values(update_eval_values) - - if self.config.print_eval: - self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) - return outputs, loss_dict - - def eval_epoch(self) -> None: - self.model.eval() - self.c_logger.print_eval_start() - loader_start_time = time.time() - batch = None - for cur_step, batch in enumerate(self.eval_loader): - # format data - batch = 
self.format_batch(batch) - loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) - outputs, _ = self.eval_step(batch, cur_step) - # Plot epoch stats and samples from the last batch. - if self.args.rank == 0: - if hasattr(self.model, "module"): - figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) - else: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) - self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) - - def test_run( - self, - ) -> None: - print(" | > Synthesizing test sentences.") - test_audios = {} - test_figures = {} - test_sentences = self.config.test_sentences - aux_inputs = self._get_aux_inputs() - for idx, sen in enumerate(test_sentences): - wav, alignment, model_outputs, _ = synthesis( - self.model, - sen, - self.config, - self.use_cuda, - self.ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - self.ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) - self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - - def _get_aux_inputs(self) -> Dict: - # setup speaker_id - speaker_id = 0 if self.config.use_speaker_embedding else None - # setup d_vector - d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) - if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding - else None - ) - # setup style_mel - if self.config.has("gst_style_input"): - style_wav = self.config.gst_style_input - else: - style_wav = None - if style_wav is None and "use_gst" in self.config and self.config.use_gst: - # inicialize GST with zero dict. 
- style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(self.config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return aux_inputs - - def fit(self) -> None: - if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {self.best_loss}.") - - # define data loaders - self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - self.eval_loader = ( - self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - if self.config.run_eval - else None - ) - - self.total_steps_done = self.restore_step - - for epoch in range(0, self.config.epochs): - self.on_epoch_start() - self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage() if self.config.run_eval else None - self.epochs_done = epoch - self.c_logger.print_epoch_start(epoch, self.config.epochs) - self.train_epoch() - if self.config.run_eval: - self.eval_epoch() - if epoch >= self.config.test_delay_epochs and self.args.rank < 0: - self.test_run() - self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values - ) - self.save_best_model() - self.on_epoch_end() - - def save_best_model(self) -> None: - self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], - self.best_loss, - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - self.model_characters, - keep_all_best=self.config.keep_all_best, - keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - - @staticmethod - def _setup_logger_config(log_file: str) -> None: - logging.basicConfig( - level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] - ) - - def on_epoch_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_start"): - self.model.on_epoch_start(self) - - if hasattr(self.criterion, "on_epoch_start"): - self.criterion.on_epoch_start(self) - - if hasattr(self.optimizer, "on_epoch_start"): - self.optimizer.on_epoch_start(self) - - def on_epoch_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_end"): - self.model.on_epoch_end(self) - - if hasattr(self.criterion, "on_epoch_end"): - self.criterion.on_epoch_end(self) - - if hasattr(self.optimizer, "on_epoch_end"): - self.optimizer.on_epoch_end(self) - - def on_train_step_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_start"): - self.model.on_train_step_start(self) - - if hasattr(self.criterion, "on_train_step_start"): - self.criterion.on_train_step_start(self) - - if hasattr(self.optimizer, "on_train_step_start"): - self.optimizer.on_train_step_start(self) - - def on_train_step_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_end"): - 
self.model.on_train_step_end(self) - - if hasattr(self.criterion, "on_train_step_end"): - self.criterion.on_train_step_end(self) - - if hasattr(self.optimizer, "on_train_step_end"): - self.optimizer.on_train_step_end(self) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py deleted file mode 100644 index 9d92ae82..00000000 --- a/TTS/utils/arguments.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Argument parser for training scripts.""" - -import argparse -import glob -import os -import re - -import torch - -from TTS.config import load_config -from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.generic_utils import create_experiment_folder, get_git_branch -from TTS.utils.io import copy_model_files -from TTS.utils.logging import ConsoleLogger, TensorboardLogger - - -def init_arguments(argv): - """Parse command line arguments of training scripts. - - Args: - argv (list): This is a list of input arguments as given by sys.argv - - Returns: - argparse.Namespace: Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--continue_path", - type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), - default="", - required="--config_path" not in argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--best_path", - type=str, - help=( - "Best model file to be used for extracting best loss." - "If not specified, the latest best model in continue path is used" - ), - default="", - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") - - return parser - - -def get_last_checkpoint(path): - """Get latest checkpoint or/and best model in path. - - It is based on globbing for `*.pth.tar` and the RegEx - `(checkpoint|best_model)_([0-9]+)`. - - Args: - path (list): Path to files to be compared. - - Raises: - ValueError: If no checkpoint or best_model files are found. - - Returns: - last_checkpoint (str): Last checkpoint filename. - """ - file_names = glob.glob(os.path.join(path, "*.pth.tar")) - last_models = {} - last_model_nums = {} - for key in ["checkpoint", "best_model"]: - last_model_num = None - last_model = None - # pass all the checkpoint files and find - # the one with the largest model number suffix. - for file_name in file_names: - match = re.search(f"{key}_([0-9]+)", file_name) - if match is not None: - model_num = int(match.groups()[0]) - if last_model_num is None or model_num > last_model_num: - last_model_num = model_num - last_model = file_name - - # if there is not checkpoint found above - # find the checkpoint with the latest - # modification date. 
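The step-suffix matching above boils down to the following standalone sketch (the file names are hypothetical, as produced by save_checkpoint and save_best_model):

import re

file_names = ["checkpoint_1000.pth.tar", "checkpoint_2500.pth.tar", "best_model_2000.pth.tar"]
last_model, last_model_num = None, None
for file_name in file_names:
    match = re.search("checkpoint_([0-9]+)", file_name)
    if match is not None:
        model_num = int(match.groups()[0])
        if last_model_num is None or model_num > last_model_num:
            last_model_num, last_model = model_num, file_name
print(last_model)  # checkpoint_2500.pth.tar
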
- key_file_names = [fn for fn in file_names if key in fn] - if last_model is None and len(key_file_names) > 0: - last_model = max(key_file_names, key=os.path.getctime) - last_model_num = torch.load(last_model)["step"] - - if last_model is not None: - last_models[key] = last_model - last_model_nums[key] = last_model_num - - # check what models were found - if not last_models: - raise ValueError(f"No models found in continue path {path}!") - if "checkpoint" not in last_models: # no checkpoint just best model - last_models["checkpoint"] = last_models["best_model"] - elif "best_model" not in last_models: # no best model - # this shouldn't happen, but let's handle it just in case - last_models["best_model"] = None - # finally check if last best model is more recent than checkpoint - elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: - last_models["checkpoint"] = last_models["best_model"] - - return last_models["checkpoint"], last_models["best_model"] - - -def process_args(args): - """Process parsed comand line arguments. - - Args: - args (argparse.Namespace or dict like): Parsed input arguments. - - Returns: - c (TTS.utils.io.AttrDict): Config paramaters. - out_path (str): Path to save models and logging. - audio_path (str): Path to save generated test audios. - c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does - logging to the console. - tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does - the TensorBoard loggind. - """ - if isinstance(args, tuple): - args, coqpit_overrides = args - if args.continue_path: - # continue a previous training from its output folder - experiment_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_checkpoint(args.continue_path) - if not args.best_path: - args.best_path = best_model - # setup output paths and read configs - config = load_config(args.config_path) - # override values from command-line args - config.parse_known_args(coqpit_overrides, relaxed_parser=True) - if config.mixed_precision: - print(" > Mixed precision mode is ON") - experiment_path = args.continue_path - if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) - audio_path = os.path.join(experiment_path, "test_audios") - # setup rank 0 process in distributed training - tb_logger = None - if args.rank == 0: - os.makedirs(audio_path, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - # if model characters are not set in the config file - # save the default set to the config file for future - # compatibility. - if config.has("characters_config"): - used_characters = parse_symbols() - new_fields["characters"] = used_characters - copy_model_files(config, experiment_path, new_fields) - os.chmod(audio_path, 0o775) - os.chmod(experiment_path, 0o775) - tb_logger = TensorboardLogger(experiment_path, model_name=config.model) - # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) - c_logger = ConsoleLogger() - return config, experiment_path, audio_path, c_logger, tb_logger - - -def init_training(argv): - """Initialization of a training run.""" - parser = init_arguments(argv) - args = parser.parse_known_args() - config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args) - return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py new file mode 100644 index 00000000..18b6c34c --- /dev/null +++ b/TTS/utils/callbacks.py @@ -0,0 +1,75 @@ +class TrainerCallback: + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + + def on_init_start(self) -> None: + if hasattr(self.trainer.model, "on_init_start"): + self.trainer.model.on_init_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_start"): + self.trainer.criterion.on_init_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_start"): + self.trainer.optimizer.on_init_start(self.trainer) + + def on_init_end(self) -> None: + if hasattr(self.trainer.model, "on_init_end"): + self.trainer.model.on_init_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_end"): + self.trainer.criterion.on_init_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_end"): + self.trainer.optimizer.on_init_end(self.trainer) + + def on_epoch_start(self) -> None: + if hasattr(self.trainer.model, "on_epoch_start"): + self.trainer.model.on_epoch_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_start"): + self.trainer.criterion.on_epoch_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_start"): + self.trainer.optimizer.on_epoch_start(self.trainer) + + def on_epoch_end(self) -> None: + if hasattr(self.trainer.model, "on_epoch_end"): + self.trainer.model.on_epoch_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_end"): + self.trainer.criterion.on_epoch_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_end"): + self.trainer.optimizer.on_epoch_end(self.trainer) + + def on_train_step_start(self) -> None: + if hasattr(self.trainer.model, "on_train_step_start"): + self.trainer.model.on_train_step_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_start"): + self.trainer.criterion.on_train_step_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_start"): + self.trainer.optimizer.on_train_step_start(self.trainer) + + def on_train_step_end(self) -> None: + + if hasattr(self.trainer.model, "on_train_step_end"): + self.trainer.model.on_train_step_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_end"): + self.trainer.criterion.on_train_step_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_end"): + self.trainer.optimizer.on_train_step_end(self.trainer) + + def on_keyboard_interrupt(self) -> None: + if hasattr(self.trainer.model, "on_keyboard_interrupt"): + self.trainer.model.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.criterion, "on_keyboard_interrupt"): + self.trainer.criterion.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.optimizer, "on_keyboard_interrupt"): + self.trainer.optimizer.on_keyboard_interrupt(self.trainer) diff --git a/TTS/utils/distribute.py b/TTS/utils/distribute.py index 7a1078e8..1c6b0e1c 100644 --- a/TTS/utils/distribute.py +++ b/TTS/utils/distribute.py @@ -1,53 +1,8 @@ # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py -import math - import torch 
import torch.distributed as dist from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.autograd import Variable -from torch.utils.data.sampler import Sampler - - -class DistributedSampler(Sampler): - """ - Non shuffling Distributed Sampler - """ - - def __init__(self, dataset, num_replicas=None, rank=None): - super().__init__(dataset) - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - indices = torch.arange(len(self.dataset)).tolist() - - # add extra samples to make it evenly divisible - indices += indices[: (self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank : self.total_size : self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch def reduce_tensor(tensor, num_gpus): diff --git a/TTS/utils/trainer_utils.py b/TTS/utils/trainer_utils.py new file mode 100644 index 00000000..02e68905 --- /dev/null +++ b/TTS/utils/trainer_utils.py @@ -0,0 +1,65 @@ +import importlib +from typing import Dict + +import torch + +from TTS.utils.training import NoamLR + + +def is_apex_available(): + return importlib.util.find_spec("apex") is not None + + +def setup_torch_training_env(cudnn_enable, cudnn_benchmark): + torch.backends.cudnn.enabled = cudnn_enable + torch.backends.cudnn.benchmark = cudnn_benchmark + torch.manual_seed(54321) + use_cuda = torch.cuda.is_available() + num_gpus = torch.cuda.device_count() + print(" > Using CUDA: ", use_cuda) + print(" > Number of GPUs: ", num_gpus) + return use_cuda, num_gpus + + +def get_scheduler( + lr_scheduler: str, lr_scheduler_params: Dict, optimizer: torch.optim.Optimizer +) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access + """Find, initialize and return a scheduler. + + Args: + lr_scheduler (str): Scheduler name. + lr_scheduler_params (Dict): Scheduler parameters. + optimizer (torch.optim.Optimizer): Optimizer to pass to the scheduler. + + Returns: + torch.optim.lr_scheduler._LRScheduler: Functional scheduler. + """ + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + scheduler = NoamLR + else: + scheduler = getattr(torch.optim.lr_scheduler, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + +def get_optimizer( + optimizer_name: str, optimizer_params: dict, lr: float, model: torch.nn.Module +) -> torch.optim.Optimizer: + """Find, initialize and return a optimizer. + + Args: + optimizer_name (str): Optimizer name. + optimizer_params (dict): Optimizer parameters. + lr (float): Initial learning rate. + model (torch.nn.Module): Model to pass to the optimizer. + + Returns: + torch.optim.Optimizer: Functional optimizer. 
+ """ + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), lr=lr, **optimizer_params) diff --git a/TTS/utils/training.py b/TTS/utils/training.py index 37b32637..aa5651c5 100644 --- a/TTS/utils/training.py +++ b/TTS/utils/training.py @@ -2,17 +2,6 @@ import numpy as np import torch -def setup_torch_training_env(cudnn_enable, cudnn_benchmark): - torch.backends.cudnn.enabled = cudnn_enable - torch.backends.cudnn.benchmark = cudnn_benchmark - torch.manual_seed(54321) - use_cuda = torch.cuda.is_available() - num_gpus = torch.cuda.device_count() - print(" > Using CUDA: ", use_cuda) - print(" > Number of GPUs: ", num_gpus) - return use_cuda, num_gpus - - def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): r"""Check model gradient against unexpected jumps and failures""" skip_flag = False @@ -41,46 +30,6 @@ def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): return grad_norm, skip_flag -def lr_decay(init_lr, global_step, warmup_steps): - r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py""" - warmup_steps = float(warmup_steps) - step = global_step + 1.0 - lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) - return lr - - -def adam_weight_decay(optimizer): - """ - Custom weight decay operation, not effecting grad values. - """ - for group in optimizer.param_groups: - for param in group["params"]: - current_lr = group["lr"] - weight_decay = group["weight_decay"] - factor = -weight_decay * group["lr"] - param.data = param.data.add(param.data, alpha=factor) - return optimizer, current_lr - - -# pylint: disable=dangerous-default-value -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): - """ - Skip biases, BatchNorm parameters, rnns. - and attention projection layer v - """ - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - - if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): - no_decay.append(param) - else: - decay.append(param) - return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] - - # pylint: disable=protected-access class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): @@ -107,3 +56,31 @@ def gradual_training_scheduler(global_step, config): if global_step * num_gpus >= values[0]: new_values = values return new_values[1], new_values[2] + + +def lr_decay(init_lr, global_step, warmup_steps): + r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py + It is only being used by the Speaker Encoder trainer.""" + warmup_steps = float(warmup_steps) + step = global_step + 1.0 + lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) + return lr + + +# pylint: disable=dangerous-default-value +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): + """ + Skip biases, BatchNorm parameters, rnns. 
+ and attention projection layer v + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): + no_decay.append(param) + else: + decay.append(param) + return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] From acd96a4940887c193a84f9b600f1f9d7ae0aec86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:23:08 +0200 Subject: [PATCH 081/258] Implement unified IO utils --- TTS/tts/utils/io.py | 120 ------------------------------------- TTS/utils/io.py | 121 +++++++++++++++++++++++++++++++++++++ TTS/vocoder/utils/io.py | 128 ---------------------------------------- 3 files changed, 121 insertions(+), 248 deletions(-) delete mode 100644 TTS/tts/utils/io.py delete mode 100644 TTS/vocoder/utils/io.py diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py deleted file mode 100644 index bb8432fa..00000000 --- a/TTS/tts/utils/io.py +++ /dev/null @@ -1,120 +0,0 @@ -import datetime -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - """Load ```TTS.tts.models``` checkpoints. - - Args: - model (TTS.tts.models): model object to load the weights for. - checkpoint_path (string): checkpoint file path. - amp (apex.amp, optional): Apex amp abject to load apex related state vars. Defaults to None. - use_cuda (bool, optional): load model to GPU if True. Defaults to False. - - Returns: - [type]: [description] - """ - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if amp and "amp" in state: - amp.load_state_dict(state["amp"]) - if use_cuda: - model.cuda() - # set model stepsize - if hasattr(model.decoder, "r"): - model.decoder.set_r(state["r"]) - print(" > Model r: ", state["r"]) - if eval: - model.eval() - return model, state - - -def save_model(model, optimizer, current_step, epoch, r, output_path, characters, amp_state_dict=None, **kwargs): - """Save ```TTS.tts.models``` states with extra fields. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None. 
- """ - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - state = { - "model": model_state, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - "r": r, - "characters": characters, - } - if amp_state_dict: - state["amp"] = amp_state_dict - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs): - """Save model checkpoint, intended for saving checkpoints at training. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - """ - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, characters, **kwargs) - - -def save_best_model( - target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs -): - """Save model checkpoint, intended for saving the best model after each epoch. - It compares the current model loss with the best loss so far and saves the - model if the current loss is better. - - Args: - target_loss (float): current model loss. - best_loss (float): best loss so far. - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - - Returns: - float: updated current best loss. 
- """ - if target_loss < best_loss: - file_name = "best_model.pth.tar" - checkpoint_path = os.path.join(output_folder, file_name) - print(" >> BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, optimizer, current_step, epoch, r, checkpoint_path, characters, model_loss=target_loss, **kwargs - ) - best_loss = target_loss - return best_loss diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 62d972f1..871cff6c 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,7 +1,12 @@ +import datetime +import glob import os import pickle as pickle_tts from shutil import copyfile +import torch +from coqpit import Coqpit + class RenamingUnpickler(pickle_tts.Unpickler): """Overload default pickler to solve module renaming problem""" @@ -41,3 +46,119 @@ def copy_model_files(config, out_path, new_fields): config.audio.stats_path, copy_stats_path, ) + + +def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin + try: + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state + + +def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): + if hasattr(model, "module"): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + if isinstance(optimizer, list): + optimizer_state = [optim.state_dict() for optim in optimizer] + else: + optimizer_state = optimizer.state_dict() if optimizer is not None else None + + if isinstance(scaler, list): + scaler_state = [s.state_dict() for s in scaler] + else: + scaler_state = scaler.state_dict() if scaler is not None else None + + if isinstance(config, Coqpit): + config = config.to_dict() + + state = { + "config": config, + "model": model_state, + "optimizer": optimizer_state, + "scaler": scaler_state, + "step": current_step, + "epoch": epoch, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + state.update(kwargs) + torch.save(state, output_path) + + +def save_checkpoint( + config, + model, + optimizer, + scaler, + current_step, + epoch, + output_folder, + **kwargs, +): + file_name = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(output_folder, file_name) + print("\n > CHECKPOINT : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + **kwargs, + ) + + +def save_best_model( + current_loss, + best_loss, + config, + model, + optimizer, + scaler, + current_step, + epoch, + out_path, + keep_all_best=False, + keep_after=10000, + **kwargs, +): + if current_loss < best_loss: + best_model_name = f"best_model_{current_step}.pth.tar" + checkpoint_path = os.path.join(out_path, best_model_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + model_loss=current_loss, + **kwargs, + ) + # only delete previous if current is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) == best_model_name: + continue + os.remove(model_name) + # create symlink to best model for convinience + 
link_name = "best_model.pth.tar" + link_path = os.path.join(out_path, link_name) + if os.path.islink(link_path) or os.path.isfile(link_path): + os.remove(link_path) + os.symlink(best_model_name, os.path.join(out_path, link_name)) + best_loss = current_loss + return best_loss diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py deleted file mode 100644 index 9c67535f..00000000 --- a/TTS/vocoder/utils/io.py +++ /dev/null @@ -1,128 +0,0 @@ -import datetime -import glob -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if use_cuda: - model.cuda() - if eval: - model.eval() - return model, state - - -def save_model( - model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, epoch, output_path, **kwargs -): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - model_disc_state = model_disc.state_dict() if model_disc is not None else None - optimizer_state = optimizer.state_dict() if optimizer is not None else None - optimizer_disc_state = optimizer_disc.state_dict() if optimizer_disc is not None else None - scheduler_state = scheduler.state_dict() if scheduler is not None else None - scheduler_disc_state = scheduler_disc.state_dict() if scheduler_disc is not None else None - state = { - "model": model_state, - "optimizer": optimizer_state, - "scheduler": scheduler_state, - "model_disc": model_disc_state, - "optimizer_disc": optimizer_disc_state, - "scheduler_disc": scheduler_disc_state, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) - for model_name in model_names: - if os.path.basename(model_name) == best_model_name: - continue - os.remove(model_name) 
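The unified helpers in TTS/utils/io.py above can be exercised on their own; a minimal sketch with toy stand-ins (model, optimizer, config and out_path are placeholders, and scaler stays None when mixed precision is off):

import os
import torch
from TTS.utils.io import save_best_model

# toy objects, only to exercise the call signature
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
config = {"run_name": "sketch"}            # a plain dict or a Coqpit config is accepted
out_path = "/tmp/tts_io_sketch"
os.makedirs(out_path, exist_ok=True)

best_loss = save_best_model(
    current_loss=0.42,
    best_loss=float("inf"),
    config=config,
    model=model,
    optimizer=optimizer,
    scaler=None,                           # pass a torch.cuda.amp.GradScaler with mixed precision
    current_step=10000,
    epoch=12,
    out_path=out_path,
    keep_all_best=False,
    keep_after=10000,
)
# writes best_model_10000.pth.tar, refreshes the best_model.pth.tar symlink,
# and returns the updated best loss (0.42 here).

The same call path replaces both the tts and vocoder save_best_model variants deleted in this patch.
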
- # create symlink to best model for convinience - link_name = "best_model.pth.tar" - link_path = os.path.join(out_path, link_name) - if os.path.islink(link_path) or os.path.isfile(link_path): - os.remove(link_path) - os.symlink(best_model_name, os.path.join(out_path, link_name)) - best_loss = current_loss - return best_loss From 76e590f33c107469ddc05c71242d09c23b5282c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:24:48 +0200 Subject: [PATCH 082/258] Update model test configs --- tests/inputs/test_align_tts.json | 6 +++--- tests/inputs/test_glow_tts.json | 6 +++--- tests/inputs/test_speedy_speech.json | 6 +++--- tests/inputs/test_tacotron2_config.json | 6 +++--- tests/inputs/test_tacotron_bd_config.json | 6 +++--- tests/inputs/test_tacotron_config.json | 6 +++--- tests/inputs/test_vocoder_multiband_melgan_config.json | 2 +- tests/inputs/test_vocoder_wavegrad.json | 4 ++-- tests/inputs/test_vocoder_wavernn_config.json | 4 ++-- 9 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts.json index 964cc66d..a0d677ad 100644 --- a/tests/inputs/test_align_tts.json +++ b/tests/inputs/test_align_tts.json @@ -123,7 +123,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -140,8 +140,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index 64cc3822..6dd86057 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -115,7 +115,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 
4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 500, // DATASET-RELATED: maximum text length @@ -132,8 +132,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. // DATASETS diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index a29fc992..02783d21 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -120,7 +120,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -137,8 +137,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index cc2c1bb5..6c82891d 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. 
"batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json index 9d2935aa..fbf3c001 100644 --- a/tests/inputs/test_tacotron_bd_config.json +++ b/tests/inputs/test_tacotron_bd_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index c8fae623..b60ed35e 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
"use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 794a3fcc..b8b192e4 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -157,7 +157,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 10, // PATHS diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index f6208e8d..6378c07a 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -88,7 +88,7 @@ // OPTIMIZER "epochs": 1, // total number of epochs to train. - "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "grad_clip": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, @@ -107,7 +107,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 4, // PATHS diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index decafa70..ee4e5f8e 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -55,7 +55,7 @@ "padding": 2, // pad the input for resnet to see wider input length // GENERATOR - for backward compatibility - "generator_model": "WaveRNN", + "generator_model": "Wavernn", // DATASET //"use_gta": true, // use computed gta features from the tts model @@ -103,7 +103,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. 
"eval_split_size": 10, // number of samples for testing // PATHS From 20081c5396c326ac3c3cc8ab2e1f828b2be795e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:27:19 +0200 Subject: [PATCH 083/258] Update tests for the new trainer API --- tests/inference_tests/test_synthesizer.py | 12 +- tests/test_extract_tts_spectrograms.py | 10 +- tests/tts_tests/test_align_tts_train.py | 2 +- tests/tts_tests/test_glow_tts.py | 89 +------------- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 27 +++- tests/tts_tests/test_speedy_speech_train.py | 4 +- .../test_tacotron2_d-vectors_train.py | 6 +- tests/tts_tests/test_tacotron2_model.py | 61 ++++++---- .../test_tacotron2_speaker_emb_train.py | 2 +- tests/tts_tests/test_tacotron2_tf_model.py | 2 +- tests/tts_tests/test_tacotron2_train.py | 2 +- tests/tts_tests/test_tacotron_model.py | 115 ++++++------------ tests/tts_tests/test_tacotron_train.py | 2 +- .../test_fullband_melgan_train.py | 8 +- tests/vocoder_tests/test_hifigan_train.py | 8 +- tests/vocoder_tests/test_melgan_train.py | 8 +- .../test_multiband_melgan_train.py | 8 +- .../test_parallel_wavegan_train.py | 8 +- tests/vocoder_tests/test_vocoder_wavernn.py | 28 ++++- tests/vocoder_tests/test_wavegrad.py | 14 +-- tests/vocoder_tests/test_wavegrad_layers.py | 7 +- tests/vocoder_tests/test_wavegrad_train.py | 10 +- tests/vocoder_tests/test_wavernn_train.py | 11 +- 24 files changed, 174 insertions(+), 272 deletions(-) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 4379c8ca..5972dc90 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -3,8 +3,7 @@ import unittest from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_checkpoint -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer from .. 
import get_tests_output_path @@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase): # pylint: disable=R0201 def _create_random_model(self): # pylint: disable=global-statement - global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json")) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model = setup_model(num_chars, 0, config) + model = setup_model(config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, 10, 10, 1, output_path, None) + save_checkpoint(config, model, None, None, 10, 1, output_path) def test_in_out(self): self._create_random_model() diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index d16167ed..8c795d58 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -6,7 +6,6 @@ import torch from tests import get_tests_input_path, get_tests_output_path, run_cli from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.tts.utils.text.symbols import phonemes, symbols torch.manual_seed(1) @@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 61d67c5c..3700b1d3 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -13,7 +13,7 @@ config = AlignTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 8a2a8fb3..171f2cdc 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase): criterion = GlowTTSLoss() # model to train - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - 
dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) # reference model to compare model weights - model_ref = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + model_ref = GlowTTS(config).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) @@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase): speaker_ids = torch.randint(0, 5, (8,)).long().to(device) # create model - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) model.eval() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index c4d57edd..e3601e67 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -13,7 +13,7 @@ config = GlowTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index d2f62d49..a5c481f1 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -1,7 +1,8 @@ import torch +from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor -from TTS.tts.models.speedy_speech import SpeedySpeech +from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs from TTS.tts.utils.data import sequence_mask use_cuda = torch.cuda.is_available() @@ -40,7 +41,8 @@ def test_speedy_speech(): y_lengths = durations.sum(1) - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128) + config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128)) + model = SpeedySpeech(config) if use_cuda: model.cuda() @@ -55,7 +57,12 @@ def test_speedy_speech(): assert list(o_dr.shape) == [B, T_en] # with speaker embedding - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) + config = 
SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256 + ) + ) + model = SpeedySpeech(config).to(device) model.forward( x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) @@ -68,9 +75,17 @@ def test_speedy_speech(): assert list(o_dr.shape) == [B, T_en] # with speaker external embedding - model = SpeedySpeech( - num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 - ).to(device) + config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, + out_channels=80, + hidden_channels=128, + num_speakers=10, + use_d_vector=True, + d_vector_dim=256, + ) + ) + model = SpeedySpeech(config).to(device) model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index bf635bc9..5ce739a2 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -4,16 +4,18 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs import SpeedySpeechConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0), batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="zh-CN", diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 7fda7e09..3313b8c4 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", @@ -24,11 +24,11 @@ config = Tacotron2Config( print_step=1, print_eval=True, use_speaker_embedding=True, - use_external_speaker_embedding_file=True, + use_d_vector_file=True, test_sentences=[ "Be a voice, not an echo.", ], - external_speaker_embedding_file="tests/data/ljspeech/speakers.json", + d_vector_file="tests/data/ljspeech/speakers.json", max_decoder_steps=50, ) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index fc3d9799..a8132467 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -7,6 +7,7 @@ from torch import nn, optim from tests import get_tests_input_path from TTS.tts.configs import Tacotron2Config +from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.layers.losses import MSELossMasked from TTS.tts.models.tacotron2 import Tacotron2 from TTS.utils.audio import AudioProcessor @@ -17,19 +18,20 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = Tacotron2Config() +config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, 
decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") class TacotronTrainTest(unittest.TestCase): def test_train_step(self): # pylint: disable=no-self-use + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 
0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids} @@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase): # pylint: disable=no-self-use def test_train_step(self): # with random gst mel style + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = 
optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index a242c724..41d694f6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/tts_tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py index ee7f720b..431b0c2f 100644 --- a/tests/tts_tests/test_tacotron2_tf_model.py +++ b/tests/tts_tests/test_tacotron2_tf_model.py @@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase): num_chars=24, num_speakers=0, r=3, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 577de014..e947a54a 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, 
phoneme_language="en-us", diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 2abd968d..6c673568 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -6,7 +6,7 @@ import torch from torch import nn, optim from tests import get_tests_input_path -from TTS.tts.configs import TacotronConfig +from TTS.tts.configs import GSTConfig, TacotronConfig from TTS.tts.layers.losses import L1LossMasked from TTS.tts.models.tacotron import Tacotron from TTS.utils.audio import AudioProcessor @@ -17,9 +17,9 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = TacotronConfig() +config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") @@ -31,11 +31,12 @@ def count_parameters(model): class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = 
torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - d_vector_dim=55, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.d_vector_dim = 55 + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} @@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 120, (8,)).long().to(device) mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) @@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor 
model.train() # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) @@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device) + linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) @@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) @@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], 
stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - use_gst=True, - gst=c.gst, - r=c.r, - memory_size=c.memory_size, - d_vector_dim=55, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.d_vector_dim = 55 + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 010154e2..0c35ee28 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -13,7 +13,7 @@ config = TacotronConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index f93a5318..9d4e1933 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -12,7 +12,7 @@ config = FullbandMelganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py index 11057570..c506fb48 100644 --- a/tests/vocoder_tests/test_hifigan_train.py +++ b/tests/vocoder_tests/test_hifigan_train.py @@ -13,7 +13,7 @@ config = HifiganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ 
config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 551b786a..6ef9cd49 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -12,7 +12,7 @@ config = MelganConfig( batch_size=4, eval_batch_size=4, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 4f12782f..c49107bd 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -12,7 +12,7 @@ config = MultibandMelganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py 
b/tests/vocoder_tests/test_parallel_wavegan_train.py index fb6ea87c..a126befe 100644 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ b/tests/vocoder_tests/test_parallel_wavegan_train.py @@ -12,7 +12,7 @@ config = ParallelWaveganConfig( batch_size=4, eval_batch_size=4, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -28,9 +28,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_vocoder_wavernn.py b/tests/vocoder_tests/test_vocoder_wavernn.py index 9c58fa1c..b5c769ee 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn.py +++ b/tests/vocoder_tests/test_vocoder_wavernn.py @@ -3,11 +3,13 @@ import random import numpy as np import torch -from TTS.vocoder.models.wavernn import WaveRNN +from TTS.vocoder.configs import WavernnConfig +from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs def test_wavernn(): - model = WaveRNN( + config = WavernnConfig() + config.model_args = WavernnArgs( rnn_dims=512, fc_dims=512, mode=10, @@ -20,14 +22,30 @@ def test_wavernn(): compute_dims=128, res_out_dims=128, num_res_blocks=10, - hop_length=256, - sample_rate=22050, ) + config.audio.hop_length = 256 + config.audio.sample_rate = 2048 + dummy_x = torch.rand((2, 1280)) dummy_m = torch.rand((2, 80, 9)) y_size = random.randrange(20, 60) dummy_y = torch.rand((80, y_size)) + + # mode: mold + model = Wavernn(config) output = model(dummy_x, dummy_m) - assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + assert np.all(output.shape == (2, 1280, 30)), output.shape + + # mode: gauss + config.model_params.mode = "gauss" + model = Wavernn(config) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 2)), output.shape + + # mode: quantized + config.model_params.mode = 4 + model = Wavernn(config) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape output = model.inference(dummy_y, True, 5500, 550) assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index a28409e5..43b5f080 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -4,7 +4,8 @@ import numpy as np import torch from torch import optim -from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.vocoder.configs import WavegradConfig +from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs # pylint: disable=unused-variable @@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 80, 20).to(device) criterion = torch.nn.L1Loss().to(device) - model = Wavegrad( + args = WavegradArgs( in_channels=80, 
out_channels=1, upsample_factors=[5, 5, 3, 2, 2], upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) - model_ref = Wavegrad( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) + model_ref = Wavegrad(config) model.train() model.to(device) betas = np.linspace(1e-6, 1e-2, 1000) diff --git a/tests/vocoder_tests/test_wavegrad_layers.py b/tests/vocoder_tests/test_wavegrad_layers.py index 0180eb0a..a0b021dc 100644 --- a/tests/vocoder_tests/test_wavegrad_layers.py +++ b/tests/vocoder_tests/test_wavegrad_layers.py @@ -1,7 +1,8 @@ import torch +from TTS.vocoder.configs import WavegradConfig from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock -from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs def test_positional_encoding(): @@ -75,12 +76,14 @@ def test_wavegrad_forward(): c = torch.rand(32, 80, 20) noise_scale = torch.rand(32) - model = Wavegrad( + args = WavegradArgs( in_channels=80, out_channels=1, upsample_factors=[5, 5, 3, 2, 2], upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) o = model.forward(x, c, noise_scale) assert o.shape[0] == 32 diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index e222de3a..fe56ee78 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -12,7 +12,7 @@ config = WavegradConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,15 +29,15 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} " +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " +) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py index 414ed719..43fc5fb1 100644 --- a/tests/vocoder_tests/test_wavernn_train.py +++ b/tests/vocoder_tests/test_wavernn_train.py @@ -4,15 +4,18 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.vocoder.configs import WavernnConfig +from TTS.vocoder.models.wavernn import WavernnArgs config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") + config = WavernnConfig( + model_params=WavernnArgs(), batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, 
test_delay_epochs=-1, epochs=1, @@ -28,9 +31,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) From f8a3460818d17565ed9387b85a7cfff90bc5023f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:28:58 +0200 Subject: [PATCH 084/258] Update tts model configs --- TTS/tts/configs/align_tts_config.py | 21 ++--- TTS/tts/configs/glow_tts_config.py | 84 ++++++++++++++++--- TTS/tts/configs/shared_configs.py | 12 ++- TTS/tts/configs/speedy_speech_config.py | 56 ++----------- TTS/tts/configs/tacotron2_config.py | 105 +----------------------- TTS/tts/configs/tacotron_config.py | 39 +++++++-- 6 files changed, 133 insertions(+), 184 deletions(-) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 56622741..837cd519 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.align_tts import AlignTTSArgs @dataclass @@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. 
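[Editor's note] This patch renames the multi-speaker options consistently across the TTS configs: `use_external_speaker_embedding_file` becomes `use_d_vector_file` and `external_speaker_embedding_file` becomes `d_vector_file`. A minimal sketch of the new field names, assuming `AlignTTSConfig` is exposed from `TTS.tts.configs` like the other configs; the JSON path is purely illustrative:

>>> from TTS.tts.configs import AlignTTSConfig
>>> config = AlignTTSConfig()
>>> config.use_speaker_embedding = True
>>> config.use_d_vector_file = True
>>> config.d_vector_file = "speakers.json"  # hypothetical path to pre-computed d-vectors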
@@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params - positional_encoding: bool = True - hidden_channels_dp: int = 256 - hidden_channels: int = 256 - encoder_type: str = "fftransformer" - encoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) - decoder_type: str = "fftransformer" - decoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) + model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) phase_start_steps: List[int] = None ssim_alpha: float = 1.0 @@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "Adam" diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 925854c9..19b7abd9 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig): Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}` use_encoder_prenet (bool): enable / disable the use of a prenet for the encoder. Defaults to True. - hidden_channels_encoder (int): + hidden_channels_enc (int): Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): + hidden_channels_dec (int): Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): + hidden_channels_dp (int): Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + mean_only (bool): + If true predict only the mean values by the decoder flow. Defaults to True. + out_channels (int): + Number of channels of the model output tensor. Defaults to 80. + num_flow_blocks_dec (int): + Number of decoder blocks. Defaults to 12. + inference_noise_scale (float): + Noise scale used at inference. Defaults to 0.33. + kernel_size_dec (int): + Decoder kernel size. Defaults to 5 + dilation_rate (int): + Rate to increase dilation by each layer in a decoder block. Defaults to 5. + num_block_layers (int): + Number of decoder layers in each decoder block. Defaults to 4. + dropout_p_dec (float): + Dropout rate for decoder. Defaults to 0.1. + num_speaker (int): + Number of speaker to define the size of speaker embedding layer. Defaults to 0. + c_in_channels (int): + Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0. + num_splits (int): + Number of split levels in inversible conv1x1 operation. Defaults to 4. + num_squeeze (int): + Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor + 'num_squeeze'. Defaults to 1. + sigmoid_scale (bool): + enable/disable sigmoid scaling in decoder. Defaults to False. + mean_only (bool): + If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true. + encoder_type (str): + Encoder module type. 
Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]` + Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper. + encoder_params (dict): + Encoder module parameters. Defaults to None. + d_vector_dim (int): + Channels of external speaker embedding vectors. Defaults to 0. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params + num_chars: int = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig): } ) use_encoder_prenet: bool = True - hidden_channels_encoder: int = 192 - hidden_channels_decoder: int = 192 - hidden_channels_duration_predictor: int = 256 + hidden_channels_enc: int = 192 + hidden_channels_dec: int = 192 + hidden_channels_dp: int = 256 + dropout_p_dp: float = 0.1 + dropout_p_dec: float = 0.05 + mean_only: bool = True + out_channels: int = 80 + num_flow_blocks_dec: int = 12 + inference_noise_scale: float = 0.33 + kernel_size_dec: int = 5 + dilation_rate: int = 5 + num_block_layers: int = 4 + num_speakers: int = 0 + c_in_channels: int = 0 + num_splits: int = 4 + num_squeeze: int = 1 + sigmoid_scale: bool = False + mean_only: bool = False + encoder_type: str = "rel_pos_transformer" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 3, + "dropout_p": 0.1, + "num_layers": 6, + "num_heads": 2, + "hidden_channels_ffn": 768, + "input_length": None, + } + ) + d_vector_dim: int = 0 # training params data_dep_init_steps: int = 10 @@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index dc9c8e0d..8f2878ff 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass, field from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @@ -150,7 +150,7 @@ class BaseTTSConfig(BaseTrainingConfig): use_phonemes: bool = False phoneme_language: str = None compute_input_seq_cache: bool = False - text_cleaner: str = MISSING + text_cleaner: str = None enable_eos_bos_chars: bool = False test_sentences_file: 
str = "" phoneme_cache_path: str = None @@ -168,10 +168,14 @@ class BaseTTSConfig(BaseTrainingConfig): # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer - optimizer: str = MISSING - optimizer_params: dict = MISSING + optimizer: str = None + optimizer_params: dict = None # scheduler lr_scheduler: str = "" lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing test_sentences: List[str] = field(default_factory=lambda: []) + # multi-speaker + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index d76d94e2..b2641ab5 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs @dataclass @@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig): Args: model (str): Model name used for selecting the right model at initialization. Defaults to `speedy_speech`. - positional_encoding (bool): - enable / disable positional encoding applied to the encoder output. Defaults to True. - hidden_channels (int): - Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder - parameters. Defaults to 128. - encoder_type (str): - Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `residual_conv_bn`. - encoder_params (dict): - Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}` - decoder_type (str): - Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `residual_conv_bn`. - decoder_params (dict): - Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}` - hidden_channels_encoder (int): - Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, - and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): - Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): - Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + model_args (Coqpit): + Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. 
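[Editor's note] Model-specific parameters now move into a nested `model_args` dataclass instead of living as flat fields on the config, as the `SpeedySpeechArgs` change below shows. A short sketch of the new pattern, assuming `SpeedySpeechConfig` is exposed from `TTS.tts.configs` and that `SpeedySpeechArgs` keeps the `hidden_channels` field the old flat config exposed:

>>> from TTS.tts.configs import SpeedySpeechConfig
>>> config = SpeedySpeechConfig()
>>> config.model_args.hidden_channels = 128  # previously set as `config.hidden_channels`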
- use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig): model: str = "speedy_speech" # model specific params - positional_encoding: bool = True - hidden_channels: int = 128 - encoder_type: str = "residual_conv_bn" - encoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], - "num_conv_blocks": 2, - "num_res_blocks": 13, - } - ) - decoder_type: str = "residual_conv_bn" - decoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], - "num_conv_blocks": 2, - "num_res_blocks": 17, - } - ) + model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs) # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py index ea66fae8..b622e640 100644 --- a/TTS/tts/configs/tacotron2_config.py +++ b/TTS/tts/configs/tacotron2_config.py @@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig): >>> from TTS.tts.configs import Tacotron2Config >>> config = Tacotron2Config() - Args: - model (str): - Model name used to select the right model class to initilize. Defaults to `Tacotron2`. - use_gst (bool): - enable / disable the use of Global Style Token modules. Defaults to False. - gst (GSTConfig): - Instance of `GSTConfig` class. - gst_style_input (str): - Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and - this is not defined, the model uses a zero vector as an input. Defaults to None. - r (int): - Number of output frames that the decoder computed per iteration. Larger values makes training and inference - faster but reduces the quality of the output frames. This needs to be tuned considering your own needs. - Defaults to 1. - gradual_trainin (List[List]): - Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is - the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. - If sets None, no gradual training is used. Defaults to None. - memory_size (int): - Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. - Defaults to -1. - prenet_type (str): - `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the - Prenet. Defaults to `original`. - prenet_dropout (bool): - enables / disables the use of dropout in the Prenet. Defaults to True. - prenet_dropout_at_inference (bool): - enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. - stopnet (bool): - enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. - stopnet_pos_weight (float): - Weight that is applied to over-weight positive instances in the Stopnet loss. 
Use larger values with - datasets with longer sentences. Defaults to 10. - separate_stopnet (bool): - Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. - attention_type (str): - attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attention_heads (int): - Number of attention heads for GMM attention. Defaults to 5. - windowing (bool): - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - use_forward_attn (bool): - It is only valid if ```attn_type``` is ```original```. Defaults to False. - forward_attn_mask (bool): - enable/disable extra masking over forward attention. It is useful at inference to prevent - possible attention failures. Defaults to False. - transition_agent (bool): - enable/disable transition agent in forward attention. Defaults to False. - location_attn (bool): - enable/disable location sensitive attention as in the original Tacotron2 paper. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - bidirectional_decoder (bool): - enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool): - enable/disable double decoder consistency. Defaults to False. - ddc_r (int): - reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this - as a multiple of the `r` value. Defaults to 6. - use_speaker_embedding (bool): - enable / disable using speaker embeddings for multi-speaker models. If set True, the model is - in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): - enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. - lr (float): - Initial learning rate. Defaults to `1e-4`. - wd (float): - Weight decay coefficient. Defaults to `1e-6`. - grad_clip (float): - Gradient clipping threshold. Defaults to `5`. - seq_len_notm (bool): - enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample - is divided by the sequence length. Defaults to False. - loss_masking (bool): - enable / disable masking the paddings of the samples in loss computation. Defaults to True. - decoder_loss_alpha (float): - Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_loss_alpha (float): - Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_diff_spec_alpha (float): - Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_diff_spec_alpha (float): - Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_ssim_alpha (float): - Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. 
Defaults to 0.25 - postnet_ssim_alpha (float): - Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - ga_alpha (float): - Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss - function. Defaults to 5. + Check `TacotronConfig` for argument descriptions. """ model: str = "tacotron2" + out_channels: int = 80 + encoder_in_features: int = 512 + decoder_in_features: int = 512 diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 2b67901c..89fb8d81 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig): gst_style_input (str): Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and this is not defined, the model uses a zero vector as an input. Defaults to None. + num_chars (int): + Number of characters used by the model. It must be defined before initializing the model. Defaults to None. + num_speakers (int): + Number of speakers for multi-speaker models. Defaults to 1. r (int): Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in @@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. max_decoder_steps (int): - Max number of steps allowed for the decoder. Defaults to 10000. + Max number of steps allowed for the decoder. Defaults to 50. + encoder_in_features (int): + Channels of encoder input and character embedding tensors. Defaults to 256. + decoder_in_features (int): + Channels of decoder input and encoder output tensors. Defaults to 256. + out_channels (int): + Channels of the final model output. It must match the spectragram size. Defaults to 80. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. optimizer (str): Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. @@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig): Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. Defaults to 0.25 decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. 
Defaults to 0.25 decoder_ssim_alpha (float): @@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig): """ model: str = "tacotron" + # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs()) use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None # model specific params + num_speakers: int = 1 + num_chars: int = 0 r: int = 2 gradual_training: List[List[int]] = None memory_size: int = -1 @@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 - max_decoder_steps: int = 10000 + max_decoder_steps: int = 500 + encoder_in_features: int = 256 + decoder_in_features: int = 256 + decoder_output_dim: int = 80 + out_channels: int = 513 # attention layers attention_type: str = "original" attention_heads: int = None attention_norm: str = "sigmoid" + attention_win: bool = False windowing: bool = False use_forward_attn: bool = False forward_attn_mask: bool = False @@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + speaker_embedding_dim: int = 512 + use_d_vector_file: bool = False + d_vector_file: str = False + d_vector_dim: int = None # optimizer parameters optimizer: str = "RAdam" @@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig): assert ( self.gradual_training[0][1] == self.r ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + if self.model == "tacotron" and self.audio is not None: + assert self.out_channels == ( + self.audio.fft_size // 2 + 1 + ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + if self.model == "tacotron2" and self.audio is not None: + assert self.out_channels == self.audio.num_mels From 70d968b169c17d3245c5e9fc7449c201fd13f637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:29:35 +0200 Subject: [PATCH 085/258] Update vocoder model configs --- TTS/vocoder/configs/fullband_melgan_config.py | 4 +-- .../configs/multiband_melgan_config.py | 2 +- .../configs/parallel_wavegan_config.py | 2 +- TTS/vocoder/configs/shared_configs.py | 28 +++++++++--------- TTS/vocoder/configs/wavegrad_config.py | 29 +++---------------- TTS/vocoder/configs/wavernn_config.py | 29 +++++-------------- 6 files changed, 29 insertions(+), 65 deletions(-) diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py index 53444214..2ab83aac 100644 --- a/TTS/vocoder/configs/fullband_melgan_config.py +++ b/TTS/vocoder/configs/fullband_melgan_config.py @@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. Defaults to @@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. 
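[Editor's note] The melgan-family configs are renamed in this patch so that the `model` field identifies the concrete vocoder (the change just below switches it from `melgan` to `fullband_melgan`), since that string is what selects the model class at initialization. A quick check of the new default, assuming `FullbandMelganConfig` is importable from `TTS.vocoder.configs` like the other vocoder configs:

>>> from TTS.vocoder.configs import FullbandMelganConfig
>>> FullbandMelganConfig().model
'fullband_melgan'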
""" - model: str = "melgan" + model: str = "fullband_melgan" # Model specific params discriminator_model: str = "melgan_multiscale_discriminator" diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 81fd7904..76311353 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. Defaults to diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index d132d2e1..a89b1f3f 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`. + Model name used for selecting the right configuration at initialization. Defaults to `gan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'parallel_wavegan_discriminator`. discriminator_model_params (dict): The discriminator model kwargs. Defaults to diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index 664032d2..6891ce6c 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig): Number of training epochs to. Defaults to 10000. wd (float): Weight decay. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig): # OPTIMIZER epochs: int = 10000 # total number of epochs to train. wd: float = 0.0 # Weight decay weight. + optimizer: str = "AdamW" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) @dataclass @@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig): }` target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_G_loss`. - gen_clip_grad (float): - Gradient clipping threshold for the generator model. Any value less than 0 disables clipping. - Defaults to -1. - disc_clip_grad (float): - Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping. - Defaults to -1. + grad_clip (list): + A list of gradient clipping theresholds for each optimizer. Any value less than 0 disables clipping. + Defaults to [5, 5]. lr_gen (float): Generator model initial learning rate. Defaults to 0.0002. lr_disc (float): Discriminator model initial learning rate. Defaults to 0.0002. - optimizer (torch.optim.Optimizer): - Optimizer used for the training. Defaults to `AdamW`. - optimizer_params (dict): - Optimizer kwargs. 
Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` lr_scheduler_gen (torch.optim.Scheduler): Learning rate scheduler for the generator. Defaults to `ExponentialLR`. lr_scheduler_gen_params (dict): @@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig): Enabling it results in slower iterations but faster convergance in some cases. Defaults to False. """ + model: str = "gan" + # LOSS PARAMETERS use_stft_loss: bool = True use_subband_stft_loss: bool = True @@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig): } ) - target_loss: str = "avg_G_loss" # loss value to pick the best model to save after each epoch + target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch # optimizer - gen_clip_grad: float = -1 # Generator gradient clipping threshold. Apply gradient clipping if > 0 - disc_clip_grad: float = -1 # Discriminator gradient clipping threshold. + grad_clip: float = field(default_factory=lambda: [5, 5]) lr_gen: float = 0.0002 # Initial learning rate. lr_disc: float = 0.0002 # Initial learning rate. - optimizer: str = "AdamW" - optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py index 271422ee..c39813ae 100644 --- a/TTS/vocoder/configs/wavegrad_config.py +++ b/TTS/vocoder/configs/wavegrad_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavegrad import WavegradArgs @dataclass @@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig): Model name used for selecting the right model at initialization. Defaults to `wavegrad`. generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is considered as a generator too. Defaults to `wavegrad`. - model_params (dict): - WaveGrad kwargs. Defaults to - ` - { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ` + model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values. target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`. 
epochs (int): @@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig): model: str = "wavegrad" # Model specific params generator_model: str = "wavegrad" - model_params: dict = field( - default_factory=lambda: { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ) - target_loss: str = "avg_wavegrad_loss" # loss value to pick the best model to save after each epoch + model_params: WavegradArgs = field(default_factory=WavegradArgs) + target_loss: str = "loss" # loss value to pick the best model to save after each epoch # Training - overrides epochs: int = 10000 diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py index 95a3cfc4..0afa1f43 100644 --- a/TTS/vocoder/configs/wavernn_config.py +++ b/TTS/vocoder/configs/wavernn_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavernn import WavernnArgs @dataclass @@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig): Batch size used at training. Larger values use more memory. Defaults to 256. seq_len (int): Audio segment length used at training. Larger values use more memory. Defaults to 1280. - padding (int): - Padding applied to the input feature frames against the convolution layers of the feature network. - Defaults to 2. + use_noise_augment (bool): enable / disable random noise added to the input waveform. The noise is added after computing the features. Defaults to True. @@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig): enable / disable mixed precision training. Default is True. eval_split_size (int): Number of samples used for evalutaion. Defaults to 50. - test_every_epoch (int): + num_epochs_before_test (int): Number of epochs waited to run the next evalution. Since inference takes some time, it is better to wait some number of epochs not ot waste training time. Defaults to 10. grad_clip (float): @@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig): model: str = "wavernn" # Model specific params - mode: str = "mold" # mold [string], gauss [string], bits [int] - mulaw: bool = True # apply mulaw if mode is bits - generator_model: str = "WaveRNN" - wavernn_model_params: dict = field( - default_factory=lambda: { - "rnn_dims": 512, - "fc_dims": 512, - "compute_dims": 128, - "res_out_dims": 128, - "num_res_blocks": 10, - "use_aux_net": True, - "use_upsample_net": True, - "upsample_factors": [4, 8, 8], # this needs to correctly factorise hop_length - } - ) + model_params: WavernnArgs = field(default_factory=WavernnArgs) + target_loss: str = "loss" # Inference batched: bool = True @@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig): epochs: int = 10000 batch_size: int = 256 seq_len: int = 1280 - padding: int = 2 use_noise_augment: bool = False use_cache: bool = True mixed_precision: bool = True eval_split_size: int = 50 - test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). + num_epochs_before_test: int = ( + 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). 
+ ) # optimizer overrides grad_clip: float = 4.0 From cae702980f5ef8fd5aaca1ce4a809ff6f046bd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:35:36 +0200 Subject: [PATCH 086/258] =?UTF-8?q?Create=20base=20=F0=9F=90=B8TTS=20model?= =?UTF-8?q?=20abstraction=20for=20tts=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/{tts/models/abstract_tts.py => model.py} | 41 ++- TTS/tts/models/align_tts.py | 159 +++++++---- TTS/tts/models/base_tacotron.py | 286 +++++++++++++++++++ TTS/tts/models/base_tts.py | 233 +++++++++++++++ TTS/tts/models/glow_tts.py | 155 ++++------ TTS/tts/models/speedy_speech.py | 147 +++++++--- TTS/tts/models/tacotron.py | 216 +++++--------- TTS/tts/models/tacotron2.py | 206 +++++-------- TTS/tts/tf/models/tacotron2.py | 6 +- TTS/vocoder/models/base_vocoder.py | 20 ++ 10 files changed, 968 insertions(+), 501 deletions(-) rename TTS/{tts/models/abstract_tts.py => model.py} (86%) create mode 100644 TTS/tts/models/base_tacotron.py create mode 100644 TTS/tts/models/base_tts.py create mode 100644 TTS/vocoder/models/base_vocoder.py diff --git a/TTS/tts/models/abstract_tts.py b/TTS/model.py similarity index 86% rename from TTS/tts/models/abstract_tts.py rename to TTS/model.py index 9132f7eb..aefb925e 100644 --- a/TTS/tts/models/abstract_tts.py +++ b/TTS/model.py @@ -1,9 +1,9 @@ -from coqpit import Coqpit from abc import ABC, abstractmethod -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Union import numpy as np import torch +from coqpit import Coqpit from torch import nn from TTS.utils.audio import AudioProcessor @@ -11,8 +11,8 @@ from TTS.utils.audio import AudioProcessor # pylint: skip-file -class TTSModel(nn.Module, ABC): - """Abstract TTS class. Every new `tts` model must inherit this. +class BaseModel(nn.Module, ABC): + """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this. Notes on input/output tensor shapes: Any input or output tensor of the model must be shaped as @@ -77,7 +77,6 @@ class TTSModel(nn.Module, ABC): ... return outputs_dict, loss_dict - @abstractmethod def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: """Create visualizations and waveform examples for training. @@ -92,10 +91,7 @@ class TTSModel(nn.Module, ABC): Returns: Tuple[Dict, np.ndarray]: training plots and output waveform. """ - figures_dict = {} - output_wav = np.array() - ... - return figures_dict, output_wav + return None, None @abstractmethod def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: @@ -114,13 +110,9 @@ class TTSModel(nn.Module, ABC): ... return outputs_dict, loss_dict - @abstractmethod def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: """The same as `train_log()`""" - figures_dict = {} - output_wav = np.array() - ... - return figures_dict, output_wav + return None, None @abstractmethod def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None: @@ -132,3 +124,24 @@ class TTSModel(nn.Module, ABC): eval (bool, optional): If true, init model for inference else for training. Defaults to False. """ ... + + def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: + """Setup an return optimizer or optimizers.""" + pass + + def get_lr(self) -> Union[float, List[float]]: + """Return learning rate(s). + + Returns: + Union[float, List[float]]: Model's initial learning rates. 
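[Editor's note] This patch introduces `TTS/model.py` with the abstract `BaseModel` that every 🐸TTS model now inherits. A minimal sketch of how a subclass plugs into that contract; the class name, batch keys, and loss key are illustrative placeholders, and only the `train_step`/`eval_step` signatures and the `(outputs, loss_dict)` return convention are taken from the interface above:

>>> from TTS.model import BaseModel
>>> class MyModel(BaseModel):
...     def train_step(self, batch, criterion):
...         # illustrative keys; each model defines its own batch and output dictionaries
...         outputs = {"model_outputs": self.forward(batch["text_input"])}
...         loss_dict = {"loss": criterion(outputs["model_outputs"], batch["mel_input"])}
...         return outputs, loss_dict
...     def eval_step(self, batch, criterion):
...         return self.train_step(batch, criterion)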
+ """ + pass + + def get_scheduler(self, optimizer: torch.optim.Optimizer): + pass + + def get_criterion(self): + pass + + def format_batch(self): + pass diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 75fb50de..dbd57b83 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,9 @@ +from dataclasses import dataclass, field +from typing import Dict, Tuple + import torch import torch.nn as nn +from coqpit import Coqpit from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder @@ -7,36 +11,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class AlignTTS(TTSModel): - """AlignTTS with modified duration predictor. - https://arxiv.org/pdf/2003.01950.pdf - - Encoder -> DurationPredictor -> Decoder - - AlignTTS's Abstract - Targeting at both high efficiency and performance, we propose AlignTTS to predict the - mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a - sequence of characters, and the duration of each character is determined by a duration predictor.Instead of - adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented - to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s - how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean - option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. - - Note: - Original model uses a separate character embedding layer for duration predictor. However, it causes the - duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, - we predict durations based on encoder outputs which has higher level information about input characters. This - enables training without phases as in the original paper. - - Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture - differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. - +@dataclass +class AlignTTSArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters @@ -64,43 +48,98 @@ class AlignTTS(TTSModel): number of channels in speaker embedding vectors. Defaults to 0. 
""" + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 256 + hidden_channels_dp: int = 256 + encoder_type: str = "fftransformer" + encoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + decoder_type: str = "fftransformer" + decoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + length_scale: float = 1.0 + num_speakers: int = 0 + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 + + +class AlignTTS(BaseTTS): + """AlignTTS with modified duration predictor. + https://arxiv.org/pdf/2003.01950.pdf + + Encoder -> DurationPredictor -> Decoder + + Check ```AlignTTSArgs``` for the class arguments. + + Examples: + >>> from TTS.tts.configs import AlignTTSConfig + >>> config = AlignTTSConfig() + >>> config.model_args.num_chars = 50 + >>> model = AlignTTS(config) + + Paper Abstract: + Targeting at both high efficiency and performance, we propose AlignTTS to predict the + mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a + sequence of characters, and the duration of each character is determined by a duration predictor.Instead of + adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented + to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s + how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean + option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. + + Note: + Original model uses a separate character embedding layer for duration predictor. However, it causes the + duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, + we predict durations based on encoder outputs which has higher level information about input characters. This + enables training without phases as in the original paper. + + Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture + differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. 
+ + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - num_chars, - out_channels, - hidden_channels=256, - hidden_channels_dp=256, - encoder_type="fftransformer", - encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - decoder_type="fftransformer", - decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - length_scale=1, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + def __init__(self, config: Coqpit): super().__init__() + self.config = config self.phase = -1 - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels_dp) + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) + self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) - self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) - self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels) + self.embedded_speaker_dim = 0 + self.init_multispeaker(config) - if num_speakers > 1 and not external_c: - # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) - nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + self.embedded_speaker_dim, + ) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels_dp) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + self.mod_layer = nn.Conv1d(config.model_args.hidden_channels, config.model_args.hidden_channels, 1) + + self.mdn_block = MDNBlock(config.model_args.hidden_channels, 2 * config.model_args.out_channels) + + if self.embedded_speaker_dim > 0 and self.embedded_speaker_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(self.embedded_speaker_dim, config.model_args.hidden_channels, 1) @staticmethod def compute_log_probs(mu, log_sigma, y): @@ -164,11 +203,12 @@ class AlignTTS(TTSModel): # project g to decoder dim. 
if hasattr(self, "proj_g"): g = self.proj_g(g) + return x + g def _forward_encoder(self, x, x_lengths, g=None): if hasattr(self, "emb_g"): - g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1] + g = nn.functional.normalize(self.speaker_embedding(g)) # [B, C, 1] if g is not None: g = g.unsqueeze(-1) @@ -315,7 +355,9 @@ class AlignTTS(TTSModel): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use + def train_log( + self, ap: AudioProcessor, batch: dict, outputs: dict + ) -> Tuple[Dict, Dict]: # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] @@ -332,7 +374,7 @@ class AlignTTS(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -349,6 +391,11 @@ class AlignTTS(TTSModel): self.eval() assert not self.training + def get_criterion(self): + from TTS.tts.layers.losses import AlignTTSLoss # pylint: disable=import-outside-toplevel + + return AlignTTSLoss(self.config) + @staticmethod def _set_phase(config, global_step): """Decide AlignTTS training phase""" diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py new file mode 100644 index 00000000..a99e1926 --- /dev/null +++ b/TTS/tts/models/base_tacotron.py @@ -0,0 +1,286 @@ +import copy +from abc import abstractmethod +from dataclasses import dataclass +from typing import Dict, List + +import torch +from coqpit import MISSING, Coqpit +from torch import nn + +from TTS.tts.layers.losses import TacotronLoss +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.text import make_symbols +from TTS.utils.generic_utils import format_aux_input +from TTS.utils.training import gradual_training_scheduler + + +@dataclass +class BaseTacotronArgs(Coqpit): + """TODO: update Tacotron configs using it""" + + num_chars: int = MISSING + num_speakers: int = MISSING + r: int = MISSING + out_channels: int = 80 + decoder_output_dim: int = 80 + attn_type: str = "original" + attn_win: bool = False + attn_norm: str = "softmax" + prenet_type: str = "original" + prenet_dropout: bool = True + prenet_dropout_at_inference: bool = False + forward_attn: bool = False + trans_agent: bool = False + forward_attn_mask: bool = False + location_attn: bool = True + attn_K: int = 5 + separate_stopnet: bool = True + bidirectional_decoder: bool = False + double_decoder_consistency: bool = False + ddc_r: int = None + encoder_in_features: int = 512 + decoder_in_features: int = 512 + d_vector_dim: int = None + use_gst: bool = False + gst: bool = None + gradual_training: bool = None + + +class BaseTacotron(BaseTTS): + def __init__(self, config: Coqpit): + """Abstract Tacotron class""" + super().__init__() + + for key in config: + setattr(self, key, config[key]) + + # layers + self.embedding = None + self.encoder = None + self.decoder = None + self.postnet = None + + # init tensors + self.embedded_speakers = None + self.embedded_speakers_projected = None + + # global style token + if self.gst and self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim + self.gst_layer = None + + # additional layers + self.decoder_backward 
= None + self.coarse_decoder = None + + # init multi-speaker layers + self.init_multispeaker(config) + + @staticmethod + def _format_aux_input(aux_input: Dict) -> Dict: + return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) + + ############################# + # INIT FUNCTIONS + ############################# + + def _init_states(self): + self.embedded_speakers = None + self.embedded_speakers_projected = None + + def _init_backward_decoder(self): + self.decoder_backward = copy.deepcopy(self.decoder) + + def _init_coarse_decoder(self): + self.coarse_decoder = copy.deepcopy(self.decoder) + self.coarse_decoder.r_init = self.ddc_r + self.coarse_decoder.set_r(self.ddc_r) + + ############################# + # CORE FUNCTIONS + ############################# + + @abstractmethod + def forward(self): + pass + + @abstractmethod + def inference(self): + pass + + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if "r" in state: + self.decoder.set_r(state["r"]) + else: + self.decoder.set_r(state["config"]["r"]) + if eval: + self.eval() + assert not self.training + + def get_criterion(self) -> nn.Module: + return TacotronLoss(self.config) + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel + parse_symbols, + phonemes, + symbols, + ) + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + return model_characters, config + + @staticmethod + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def get_aux_input(self, **kwargs) -> Dict: + """Compute Tacotron's auxiliary inputs based on model config. + - speaker d_vector + - style wav for GST + - speaker ID for speaker embedding + """ + # setup speaker_id + if self.config.use_speaker_embedding: + speaker_id = kwargs.get("speaker_id", 0) + else: + speaker_id = None + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) + if self.config.use_d_vector_file and self.config.use_speaker_embedding + else None + ) + # setup style_mel + if "style_wav" in kwargs: + style_wav = kwargs["style_wav"] + elif self.config.has("gst_style_input"): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and "use_gst" in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
+ style_wav = {} + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs + + ############################# + # COMMON COMPUTE FUNCTIONS + ############################# + + def compute_masks(self, text_lengths, mel_lengths): + """Compute masks against sequence paddings.""" + # B x T_in_max (boolean) + input_mask = sequence_mask(text_lengths) + output_mask = None + if mel_lengths is not None: + max_len = mel_lengths.max() + r = self.decoder.r + max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len + output_mask = sequence_mask(mel_lengths, max_len=max_len) + return input_mask, output_mask + + def _backward_pass(self, mel_specs, encoder_outputs, mask): + """Run backwards decoder""" + decoder_outputs_b, alignments_b, _ = self.decoder_backward( + encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask + ) + decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() + return decoder_outputs_b, alignments_b + + def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask): + """Double Decoder Consistency""" + T = mel_specs.shape[1] + if T % self.coarse_decoder.r > 0: + padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) + mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0)) + decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( + encoder_outputs.detach(), mel_specs, input_mask + ) + # scale_factor = self.decoder.r_init / self.decoder.r + alignments_backward = torch.nn.functional.interpolate( + alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" + ).transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward[:, :T, :] + return decoder_outputs_backward, alignments_backward + + ############################# + # EMBEDDING FUNCTIONS + ############################# + + def compute_speaker_embedding(self, speaker_ids): + """Compute speaker embedding vectors""" + if hasattr(self, "speaker_embedding") and speaker_ids is None: + raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") + if hasattr(self, "speaker_embedding") and speaker_ids is not None: + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) + if hasattr(self, "speaker_project_mel") and speaker_ids is not None: + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) + + def compute_gst(self, inputs, style_input, speaker_embedding=None): + """Compute global style token""" + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) + if speaker_embedding is not None: + query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) + + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + else: + gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) + return inputs + + @staticmethod + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ + return outputs + + @staticmethod + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) + return outputs + + ############################# + # CALLBACKS + ############################# + + def on_epoch_start(self, trainer): + """Callback for setting values wrt gradual training schedule. + + Args: + trainer (TrainerTTS): TTS trainer object that is used to train this model. + """ + if self.gradual_training: + r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) + trainer.config.r = r + self.decoder.set_r(r) + if trainer.config.bidirectional_decoder: + trainer.model.decoder_backward.set_r(r) + trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) + trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) + print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py new file mode 100644 index 00000000..1de7ba92 --- /dev/null +++ b/TTS/tts/models/base_tts.py @@ -0,0 +1,233 @@ +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.model import BaseModel +from TTS.tts.datasets import TTSDataset +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text import make_symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor + +# pylint: skip-file + + +class BaseTTS(BaseModel): + """Abstract `tts` class. Every new `tts` model must inherit this. 
+ + It defines `tts` specific functions on top of `Model`. + + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + return model_characters, config + + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def init_multispeaker(self, config: Coqpit, data: List = None): + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + If you need a different behaviour, override this function for your model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + # init speaker manager + self.speaker_manager = get_speaker_manager(config, data=data) + self.num_speakers = self.speaker_manager.num_speakers + # init speaker embedding layer + if config.use_speaker_embedding and not config.use_d_vector_file: + self.embedded_speaker_dim = ( + config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 + ) + self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + def get_aux_input(self, **kwargs) -> Dict: + """Prepare and return `aux_input` used by `forward()`""" + pass + + def format_batch(self, batch: Dict) -> Dict: + """Generic batch formatting for `TTSDataset`. + + You must override this if you use a custom dataset. + + Args: + batch (Dict): [description] + + Returns: + Dict: [description] + """ + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # compute durations from attention masks + durations = None + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, : text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. + stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "speaker_names": speaker_names, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "d_vectors": d_vectors, + "max_text_length": float(max_text_length), + "max_spec_length": float(max_spec_length), + "item_idx": item_idx, + } + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool, num_gpus: int + ) -> "DataLoader": + if is_eval and not config.run_eval: + loader = None + else: + # setup multi-speaker attributes + if hasattr(self, "speaker_manager"): + speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None + d_vector_mapping = ( + self.speaker_manager.d_vectors + if config.use_speaker_embedding and config.use_d_vector_file + else None + ) + else: + speaker_id_mapping = None + d_vector_mapping = None + + # init dataloader + dataset = TTSDataset( + outputs_per_step=config.r if "r" in config else 1, + text_cleaner=config.text_cleaner, + compute_linear_spec=config.model.lower() == "tacotron", + meta_data=data_items, + ap=ap, + tp=config.characters, + add_blank=config["add_blank"], + batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, + min_seq_len=config.min_seq_len, + max_seq_len=config.max_seq_len, + phoneme_cache_path=config.phoneme_cache_path, + use_phonemes=config.use_phonemes, + phoneme_language=config.phoneme_language, + enable_eos_bos=config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_id_mapping=speaker_id_mapping, + d_vector_mapping=d_vector_mapping + if config.use_speaker_embedding and config.use_d_vector_file + else None, + ) + + if config.use_phonemes and config.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=config.eval_batch_size if is_eval else config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def test_run(self) -> Tuple[Dict, Dict]: + """Generic test run for `tts` models used by `Trainer`. + + You can override this for a different behaviour. + + Returns: + Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
+ """ + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_aux_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) + return test_figures, test_audios diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index a30eadb4..ca2682dc 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -4,131 +4,89 @@ import torch from torch import nn from torch.nn import functional as F +from TTS.tts.configs import GlowTTSConfig from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class GlowTTS(TTSModel): +class GlowTTS(BaseTTS): """Glow TTS models from https://arxiv.org/abs/2005.11129 - Args: - num_chars (int): number of embedding characters. - hidden_channels_enc (int): number of embedding and encoder channels. - hidden_channels_dec (int): number of decoder channels. - use_encoder_prenet (bool): enable/disable prenet for encoder. Prenet modules are hard-coded for each alternative encoder. - hidden_channels_dp (int): number of duration predictor channels. - out_channels (int): number of output channels. It should be equal to the number of spectrogram filter. - num_flow_blocks_dec (int): number of decoder blocks. - kernel_size_dec (int): decoder kernel size. - dilation_rate (int): rate to increase dilation by each layer in a decoder block. - num_block_layers (int): number of decoder layers in each decoder block. - dropout_p_dec (float): dropout rate for decoder. - num_speaker (int): number of speaker to define the size of speaker embedding layer. - c_in_channels (int): number of speaker embedding channels. It is set to 512 if embeddings are learned. - num_splits (int): number of split levels in inversible conv1x1 operation. - num_squeeze (int): number of squeeze levels. When squeezing channels increases and time steps reduces by the factor 'num_squeeze'. - sigmoid_scale (bool): enable/disable sigmoid scaling in decoder. - mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. - encoder_type (str): encoder module type. - encoder_params (dict): encoder module parameters. - d_vector_dim (int): channels of external speaker embedding vectors. + Paper abstract: + Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate + mel-spectrograms from text in parallel. 
Despite the advantage, the parallel TTS models cannot be trained + without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS, + a flow-based generative model for parallel TTS that does not require any external aligner. By combining the + properties of flows and dynamic programming, the proposed model searches for the most probable monotonic + alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard + monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows + enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over + the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our + model can be easily extended to a multi-speaker setting. + + Check `GlowTTSConfig` for class arguments. """ - def __init__( - self, - num_chars, - hidden_channels_enc, - hidden_channels_dec, - use_encoder_prenet, - hidden_channels_dp, - out_channels, - num_flow_blocks_dec=12, - inference_noise_scale=0.33, - kernel_size_dec=5, - dilation_rate=5, - num_block_layers=4, - dropout_p_dp=0.1, - dropout_p_dec=0.05, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - encoder_type="transformer", - encoder_params=None, - d_vector_dim=None, - ): + def __init__(self, config: GlowTTSConfig): super().__init__() - self.num_chars = num_chars - self.hidden_channels_dp = hidden_channels_dp - self.hidden_channels_enc = hidden_channels_enc - self.hidden_channels_dec = hidden_channels_dec - self.out_channels = out_channels - self.num_flow_blocks_dec = num_flow_blocks_dec - self.kernel_size_dec = kernel_size_dec - self.dilation_rate = dilation_rate - self.num_block_layers = num_block_layers - self.dropout_p_dec = dropout_p_dec - self.num_speakers = num_speakers - self.c_in_channels = c_in_channels - self.num_splits = num_splits - self.num_squeeze = num_squeeze - self.sigmoid_scale = sigmoid_scale - self.mean_only = mean_only - self.use_encoder_prenet = use_encoder_prenet - self.inference_noise_scale = inference_noise_scale - # model constants. - self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. - self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. 
- self.d_vector_dim = d_vector_dim + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + self.init_multispeaker(config) + + # pass all config fields to `self` + # for fewer code change + self.config = config + for key in config: + setattr(self, key, config[key]) # if is a multispeaker and c_in_channels is 0, set to 256 - if num_speakers > 1: - if self.c_in_channels == 0 and not self.d_vector_dim: + self.c_in_channels = 0 + if self.num_speakers > 1: + if self.d_vector_dim: + self.c_in_channels = self.d_vector_dim + elif self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.d_vector_dim: - self.c_in_channels = self.d_vector_dim self.encoder = Encoder( - num_chars, - out_channels=out_channels, - hidden_channels=hidden_channels_enc, - hidden_channels_dp=hidden_channels_dp, - encoder_type=encoder_type, - encoder_params=encoder_params, - mean_only=mean_only, - use_prenet=use_encoder_prenet, - dropout_p_dp=dropout_p_dp, + self.num_chars, + out_channels=self.out_channels, + hidden_channels=self.hidden_channels_enc, + hidden_channels_dp=self.hidden_channels_dp, + encoder_type=self.encoder_type, + encoder_params=self.encoder_params, + mean_only=self.mean_only, + use_prenet=self.use_encoder_prenet, + dropout_p_dp=self.dropout_p_dp, c_in_channels=self.c_in_channels, ) self.decoder = Decoder( - out_channels, - hidden_channels_dec, - kernel_size_dec, - dilation_rate, - num_flow_blocks_dec, - num_block_layers, - dropout_p=dropout_p_dec, - num_splits=num_splits, - num_squeeze=num_squeeze, - sigmoid_scale=sigmoid_scale, + self.out_channels, + self.hidden_channels_dec, + self.kernel_size_dec, + self.dilation_rate, + self.num_flow_blocks_dec, + self.num_block_layers, + dropout_p=self.dropout_p_dec, + num_splits=self.num_splits, + num_squeeze=self.num_squeeze, + sigmoid_scale=self.sigmoid_scale, c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not d_vector_dim: + if self.num_speakers > 1 and not self.d_vector_dim: # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) + self.emb_g = nn.Embedding(self.num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @staticmethod @@ -377,7 +335,7 @@ class GlowTTS(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -406,3 +364,8 @@ class GlowTTS(TTSModel): self.eval() self.store_inverse() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import GlowTTSLoss # pylint: disable=import-outside-toplevel + + return GlowTTSLoss() diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 44a47722..2eb70a6b 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -1,4 +1,7 @@ +from dataclasses import dataclass, field + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder @@ -6,25 +9,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts 
import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class SpeedySpeech(TTSModel): - """Speedy Speech model - https://arxiv.org/abs/2008.03802 - - Encoder -> DurationPredictor -> Decoder - - This model is able to achieve a reasonable performance with only - ~3M model parameters and convolutional layers. - - This model requires precomputed phoneme durations to train a duration predictor. At inference - it only uses the duration predictor to compute durations and expand encoder outputs respectively. - +@dataclass +class SpeedySpeechArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size. @@ -36,49 +30,107 @@ class SpeedySpeech(TTSModel): decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'. decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }. num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0. - external_c (bool, optional): enable external speaker embeddings. Defaults to False. - c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0. + use_d_vector (bool, optional): enable external speaker embeddings. Defaults to False. + d_vector_dim (int, optional): number of channels in speaker embedding vectors. Defaults to 0. """ - # pylint: disable=dangerous-default-value - - def __init__( - self, - num_chars, - out_channels, - hidden_channels, - positional_encoding=True, - length_scale=1, - encoder_type="residual_conv_bn", - encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, - decoder_type="residual_conv_bn", - decoder_params={ + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 128 + num_speakers: int = 0 + positional_encoding: bool = True + length_scale: int = 1 + encoder_type: str = "residual_conv_bn" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13, + } + ) + decoder_type: str = "residual_conv_bn" + decoder_params: dict = field( + default_factory=lambda: { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17, - }, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + } + ) + use_d_vector: bool = False + d_vector_dim: int = 0 + +class SpeedySpeech(BaseTTS): + """Speedy Speech model + https://arxiv.org/abs/2008.03802 + + Encoder -> DurationPredictor -> Decoder + + Paper abstract: + While recent neural sequence-to-sequence models have greatly improved the quality of speech + synthesis, there has not been a system capable of fast training, fast inference and high-quality audio synthesis + at the same time. We propose a student-teacher network capable of high-quality faster-than-real-time spectrogram + synthesis, with low requirements on computational resources and fast training time. We show that self-attention + layers are not necessary for generation of high quality audio. 
We utilize simple convolutional blocks with + residual connections in both student and teacher networks and use only a single attention layer in the teacher + model. Coupled with a MelGAN vocoder, our model's voice quality was rated significantly higher than Tacotron 2. + Our model can be efficiently trained on a single GPU and can run in real time even on a CPU. We provide both + our source code and audio samples in our GitHub repository. + + Notes: + The vanilla model is able to achieve a reasonable performance with only + ~3M model parameters and convolutional layers. + + This model requires precomputed phoneme durations to train a duration predictor. At inference + it only uses the duration predictor to compute durations and expand encoder outputs respectively. + + You can also mix and match different encoder and decoder networks beyond the paper. + + Check `SpeedySpeechArgs` for arguments. + """ + + # pylint: disable=dangerous-default-value + + def __init__(self, config: Coqpit): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - if positional_encoding: - self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) + self.config = config - if num_speakers > 1 and not external_c: + if "characters" in config: + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) + self.emb = nn.Embedding(config.model_args.num_chars, config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + config.model_args.d_vector_dim, + ) + if config.model_args.positional_encoding: + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels + config.model_args.d_vector_dim) + + if config.model_args.num_speakers > 1 and not config.model_args.use_d_vector: # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) + self.emb_g = nn.Embedding(config.model_args.num_speakers, config.model_args.d_vector_dim) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + if config.model_args.d_vector_dim > 0 and config.model_args.d_vector_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(config.model_args.d_vector_dim, config.model_args.hidden_channels, 1) @staticmethod def expand_encoder_outputs(en, dr, x_mask, y_mask): @@ -244,7 +296,7 @@ class SpeedySpeech(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -260,3 
+312,8 @@ class SpeedySpeech(TTSModel): if eval: self.eval() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import SpeedySpeechLoss # pylint: disable=import-outside-toplevel + + return SpeedySpeechLoss(self.config) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 317d1905..95b4a358 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,166 +1,86 @@ # coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class Tacotron(TacotronAbstract): +class Tacotron(BaseTacotron): """Tacotron as in https://arxiv.org/abs/1703.10135 - It's an autoregressive encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. 
- d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` - output frames to the prenet. - gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. - Defaults to `[]`. - max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. + Check `TacotronConfig` for the arguments. """ - def __init__( - self, - num_chars, - num_speakers, - r=5, - postnet_output_dim=1025, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=256, - decoder_in_features=256, - d_vector_dim=None, - use_gst=False, - gst=None, - memory_size=5, - gradual_training=None, - max_decoder_steps=500, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - d_vector_dim, - use_gst, - gst, - gradual_training, - ) + def __init__(self, config: Coqpit): + super().__init__(config) - # speaker embedding layers + self.num_chars, self.config = self.get_characters(config) + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) + + # speaker embedding layer if self.num_speakers > 1: - if not self.use_d_vectors: - d_vector_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += d_vector_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding.weight.data.normal_(0, 0.3) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.decoder_output_dim, + self.r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = 
PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) + self.postnet = PostCBHG(self.decoder_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - d_vector_dim=d_vector_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder if self.bidirectional_decoder: @@ -169,21 +89,21 @@ class Tacotron(TacotronAbstract): if self.double_decoder_consistency: self.coarse_decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - ddc_r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.decoder_output_dim, + self.ddc_r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): @@ -205,7 +125,9 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) # speaker embedding if self.num_speakers > 1: if not self.use_d_vectors: @@ -341,7 +263,7 @@ class Tacotron(TacotronAbstract): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap, batch, outputs): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: postnet_outputs = outputs["model_outputs"] alignments = outputs["alignments"] alignments_backward = outputs["alignments_backward"] @@ -362,7 +284,7 @@ class Tacotron(TacotronAbstract): # Sample audio train_audio = ap.inv_spectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch, criterion): return self.train_step(batch, criterion) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index d56bd988..eaca3ff8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,160 +1,84 @@ # coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import 
plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class Tacotron2(TacotronAbstract): +class Tacotron2(BaseTacotron): """Tacotron2 as in https://arxiv.org/abs/1712.05884 - - It's an autoregressive encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.tacotron.common_layers.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. - Defaults to `[]`. - max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. + Check `TacotronConfig` for the arguments. 
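+
+    Example (illustrative sketch; assumes the matching config class is importable from `TTS.tts.configs`):
+        >>> from TTS.tts.configs import Tacotron2Config
+        >>> config = Tacotron2Config()
+        >>> model = Tacotron2(config)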
""" - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - d_vector_dim=None, - use_gst=False, - gst=None, - gradual_training=None, - max_decoder_steps=500, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - d_vector_dim, - use_gst, - gst, - gradual_training, - ) + def __init__(self, config: Coqpit): + super().__init__(config) + + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) # speaker embedding layer if self.num_speakers > 1: - if not self.use_d_vectors: - d_vector_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += d_vector_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = Postnet(self.postnet_output_dim) + self.postnet = Postnet(self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_g = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers - if self.gst and use_gst: + if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - d_vector_dim=d_vector_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + 
num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder @@ -165,19 +89,19 @@ class Tacotron2(TacotronAbstract): self.coarse_decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - ddc_r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.ddc_r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) @staticmethod @@ -206,7 +130,9 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim @@ -342,7 +268,7 @@ class Tacotron2(TacotronAbstract): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap, batch, outputs): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: postnet_outputs = outputs["model_outputs"] alignments = outputs["alignments"] alignments_backward = outputs["alignments_backward"] @@ -363,7 +289,7 @@ class Tacotron2(TacotronAbstract): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch, criterion): return self.train_step(batch, criterion) diff --git a/TTS/tts/tf/models/tacotron2.py b/TTS/tts/tf/models/tacotron2.py index 9cc62070..7a1d695d 100644 --- a/TTS/tts/tf/models/tacotron2.py +++ b/TTS/tts/tf/models/tacotron2.py @@ -12,7 +12,7 @@ class Tacotron2(keras.models.Model): num_chars, num_speakers, r, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, @@ -31,7 +31,7 @@ class Tacotron2(keras.models.Model): super().__init__() self.r = r self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim + self.out_channels = out_channels self.bidirectional_decoder = bidirectional_decoder self.num_speakers = num_speakers self.speaker_embed_dim = 256 @@ -58,7 +58,7 @@ class Tacotron2(keras.models.Model): name="decoder", enable_tflite=enable_tflite, ) - self.postnet = Postnet(postnet_output_dim, 5, name="postnet") + self.postnet = Postnet(out_channels, 5, name="postnet") @tf.function(experimental_relax_shapes=True) def call(self, characters, text_lengths=None, frames=None, training=None): diff --git a/TTS/vocoder/models/base_vocoder.py b/TTS/vocoder/models/base_vocoder.py new file mode 100644 index 00000000..f879cd42 --- /dev/null +++ b/TTS/vocoder/models/base_vocoder.py @@ -0,0 +1,20 @@ +from TTS.model import BaseModel + +# pylint: skip-file + + +class BaseVocoder(BaseModel): + """Base `vocoder` class. Every new `vocoder` model must inherit this. + + It defines `vocoder` specific functions on top of `Model`. 
+ + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + def __init__(self): + super().__init__() From d10f9c567696a9c552bf320d5755c35a324330e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:36:47 +0200 Subject: [PATCH 087/258] Update `tts.models.setup_model` --- TTS/tts/models/__init__.py | 144 ++++++++++--------------------------- 1 file changed, 38 insertions(+), 106 deletions(-) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 2a951267..c6390beb 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,110 +1,42 @@ +from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model(num_chars, num_speakers, c, d_vector_dim=None): - print(" > Using model: {}".format(c.model)) - MyModel = find_module("TTS.tts.models", c.model.lower()) - if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - d_vector_dim=d_vector_dim, - max_decoder_steps=c.max_decoder_steps, - ) - elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - d_vector_dim=d_vector_dim, - max_decoder_steps=c.max_decoder_steps, - ) - elif c.model.lower() == "glow_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - hidden_channels_enc=c["hidden_channels_encoder"], - hidden_channels_dec=c["hidden_channels_decoder"], - hidden_channels_dp=c["hidden_channels_duration_predictor"], - out_channels=c.audio["num_mels"], - encoder_type=c.encoder_type, - encoder_params=c.encoder_params, - use_encoder_prenet=c["use_encoder_prenet"], - inference_noise_scale=c.inference_noise_scale, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.05, - num_speakers=num_speakers, - c_in_channels=0, - num_splits=4, - num_squeeze=2, - 
sigmoid_scale=False, - mean_only=True, - d_vector_dim=d_vector_dim, - ) - elif c.model.lower() == "speedy_speech": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - positional_encoding=c["positional_encoding"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - elif c.model.lower() == "align_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - hidden_channels_dp=c["hidden_channels_dp"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) +def setup_model(config): + print(" > Using model: {}".format(config.model)) + + MyModel = find_module("TTS.tts.models", config.model.lower()) + # define set of characters used by the model + if config.characters is not None: + # set characters from config + symbols, phonemes = make_symbols(**config.characters.to_dict()) # pylint: disable=redefined-outer-name + else: + from TTS.tts.utils.text.symbols import phonemes, symbols # pylint: disable=import-outside-toplevel + + # use default characters and assign them to config + config.characters = parse_symbols() + num_chars = len(phonemes) if config.use_phonemes else len(symbols) + # consider special `blank` character if `add_blank` is set True + num_chars = num_chars + getattr(config, "add_blank", False) + config.num_chars = num_chars + # compatibility fix + if "model_params" in config: + config.model_params.num_chars = num_chars + if "model_args" in config: + config.model_args.num_chars = num_chars + model = MyModel(config) return model + + +# TODO; class registery +# def import_models(models_dir, namespace): +# for file in os.listdir(models_dir): +# path = os.path.join(models_dir, file) +# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): +# model_name = file[: file.find(".py")] if file.endswith(".py") else file +# importlib.import_module(namespace + "." 
+ model_name) +# +# +## automatically import any Python files in the models/ directory +# models_dir = os.path.dirname(__file__) +# import_models(models_dir, "TTS.tts.models") From 420820caf4f0ad52b232408ec564b8bbb15afcab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:37:23 +0200 Subject: [PATCH 088/258] Update vocoder models --- TTS/vocoder/models/gan.py | 246 +++++++++++++++++++++++++ TTS/vocoder/models/wavegrad.py | 190 ++++++++++++++++--- TTS/vocoder/models/wavernn.py | 322 ++++++++++++++++++++++++--------- 3 files changed, 639 insertions(+), 119 deletions(-) create mode 100644 TTS/vocoder/models/gan.py diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py new file mode 100644 index 00000000..58d6532e --- /dev/null +++ b/TTS/vocoder/models/gan.py @@ -0,0 +1,246 @@ +from inspect import signature +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss +from TTS.vocoder.models import setup_discriminator, setup_generator +from TTS.vocoder.models.base_vocoder import BaseVocoder +from TTS.vocoder.utils.generic_utils import plot_results + + +class GAN(BaseVocoder): + def __init__(self, config: Coqpit): + """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer. + It also helps mixing and matching different generator and disciminator networks easily. + + Args: + config (Coqpit): Model configuration. + + Examples: + Initializing the GAN model with HifiGAN generator and discriminator. + >>> from TTS.vocoder.configs import HifiganConfig + >>> config = HifiganConfig() + >>> model = GAN(config) + """ + super().__init__() + self.config = config + self.model_g = setup_generator(config) + self.model_d = setup_discriminator(config) + self.train_disc = False # if False, train only the generator. + self.y_hat_g = None # the last generator prediction to be passed onto the discriminator + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.model_g.forward(x) + + def inference(self, x: torch.Tensor) -> torch.Tensor: + return self.model_g.inference(x) + + def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + outputs = None + loss_dict = None + + x = batch["input"] + y = batch["waveform"] + + if optimizer_idx not in [0, 1]: + raise ValueError(" [!] 
Unexpected `optimizer_idx`.") + + if optimizer_idx == 0: + # GENERATOR + # generator pass + y_hat = self.model_g(x)[:, :, : y.size(2)] + self.y_hat_g = y_hat # save for discriminator + y_hat_sub = None + y_sub = None + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat_sub = y_hat + y_hat = self.model_g.pqmf_synthesis(y_hat) + self.y_hat_g = y_hat # save for discriminator + y_sub = self.model_g.pqmf_analysis(y) + + scores_fake, feats_fake, feats_real = None, None, None + if self.train_disc: + + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat, x) + else: + D_out_fake = self.model_d(y_hat) + D_out_real = None + + if self.config.use_feat_match_loss: + with torch.no_grad(): + D_out_real = self.model_d(y) + + # format D outputs + if isinstance(D_out_fake, tuple): + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + feats_real = None + else: + _, feats_real = D_out_real + else: + scores_fake = D_out_fake + feats_fake, feats_real = None, None + + # compute losses + loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub) + outputs = {"model_outputs": y_hat} + + if optimizer_idx == 1: + # DISCRIMINATOR + if self.train_disc: + # use different samples for G and D trainings + if self.config.diff_samples_for_G_and_D: + x_d = batch["input_disc"] + y_d = batch["waveform_disc"] + # use a different sample than generator + with torch.no_grad(): + y_hat = self.model_g(x_d) + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat = self.model_g.pqmf_synthesis(y_hat) + else: + # use the same samples as generator + x_d = x.clone() + y_d = y.clone() + y_hat = self.y_hat_g + + # run D with or without cond. features + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat.detach().clone(), x_d) + D_out_real = self.model_d(y_d, x_d) + else: + D_out_fake = self.model_d(y_hat.detach()) + D_out_real = self.model_d(y_d) + + # format D outputs + if isinstance(D_out_fake, tuple): + # self.model_d returns scores and features + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + scores_real, feats_real = None, None + else: + scores_real, feats_real = D_out_real + else: + # model D returns only scores + scores_fake = D_out_fake + scores_real = D_out_real + + # compute losses + loss_dict = criterion[optimizer_idx](scores_fake, scores_real) + outputs = {"model_outputs": y_hat} + + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + y_hat = outputs[0]["model_outputs"] + y = batch["waveform"] + figures = plot_results(y_hat, y, ap, "train") + sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + audios = {"train/audio": sample_voice} + return figures, audios + + @torch.no_grad() + def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion, optimizer_idx) + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return self.train_log(ap, batch, outputs) + + def load_checkpoint( + self, + config: Coqpit, + checkpoint_path: str, + eval: bool = False, # pylint: disable=unused-argument, redefined-builtin + ) -> None: + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + # band-aid for older than v0.0.15 GAN models + if "model_disc" in state: + self.model_g.load_checkpoint(config, checkpoint_path, eval) + else: + self.load_state_dict(state["model"]) + if eval: + 
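+ # the discriminator is only needed during training; for inference keep just the
+ # generator and strip its weight norm below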
self.model_d = None + if hasattr(self.model_g, "remove_weight_norm"): + self.model_g.remove_weight_norm() + + def on_train_step_start(self, trainer) -> None: + self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator + + def get_optimizer(self): + optimizer1 = get_optimizer( + self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, self.model_g + ) + optimizer2 = get_optimizer( + self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d + ) + return [optimizer1, optimizer2] + + def get_lr(self): + return [self.config.lr_gen, self.config.lr_disc] + + def get_scheduler(self, optimizer): + scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0]) + scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1]) + return [scheduler1, scheduler2] + + @staticmethod + def format_batch(batch): + if isinstance(batch[0], list): + x_G, y_G = batch[0] + x_D, y_D = batch[1] + return {"input": x_G, "waveform": y_G, "input_disc": x_D, "waveform_disc": y_D} + x, y = batch + return {"input": x, "waveform": y} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def get_criterion(self): + """Return criterions for the optimizers""" + return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)] diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 84dde957..03d5160e 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,65 +1,105 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple + import numpy as np import torch +from coqpit import Coqpit from torch import nn from torch.nn.utils import weight_norm +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler -from ..layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.model import BaseModel +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets import WaveGradDataset +from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.vocoder.utils.generic_utils import plot_results -class Wavegrad(nn.Module): +@dataclass +class WavegradArgs(Coqpit): + in_channels: int = 80 + out_channels: int = 1 + use_weight_norm: bool = False + y_conv_channels: int = 32 + x_conv_channels: int = 768 + dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: List[int] = field(default_factory=lambda: [512, 
512, 256, 128, 128]) + upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: List[List[int]] = field( + default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] + ) + + +class Wavegrad(BaseModel): + """🐸 🌊 WaveGrad 🌊 model. + Paper - https://arxiv.org/abs/2009.00713 + + Examples: + Initializing the model. + + >>> from TTS.vocoder.configs import WavegradConfig + >>> config = WavegradConfig() + >>> model = Wavegrad(config) + + Paper Abstract: + This paper introduces WaveGrad, a conditional model for waveform generation which estimates gradients of the + data density. The model is built on prior work on score matching and diffusion probabilistic models. It starts + from a Gaussian white noise signal and iteratively refines the signal via a gradient-based sampler conditioned + on the mel-spectrogram. WaveGrad offers a natural way to trade inference speed for sample quality by adjusting + the number of refinement steps, and bridges the gap between non-autoregressive and autoregressive models in + terms of audio quality. We find that it can generate high fidelity audio samples using as few as six iterations. + Experiments reveal WaveGrad to generate high fidelity audio, outperforming adversarial non-autoregressive + baselines and matching a strong likelihood-based autoregressive baseline using fewer sequential operations. + Audio samples are available at this https URL. + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - in_channels=80, - out_channels=1, - use_weight_norm=False, - y_conv_channels=32, - x_conv_channels=768, - dblock_out_channels=[128, 128, 256, 512], - ublock_out_channels=[512, 512, 256, 128, 128], - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ): + def __init__(self, config: Coqpit): super().__init__() - - self.use_weight_norm = use_weight_norm - self.hop_len = np.prod(upsample_factors) + self.config = config + self.use_weight_norm = config.model_params.use_weight_norm + self.hop_len = np.prod(config.model_params.upsample_factors) self.noise_level = None self.num_steps = None self.beta = None self.alpha = None self.alpha_hat = None - self.noise_level = None self.c1 = None self.c2 = None self.sigma = None # dblocks - self.y_conv = Conv1d(1, y_conv_channels, 5, padding=2) + self.y_conv = Conv1d(1, config.model_params.y_conv_channels, 5, padding=2) self.dblocks = nn.ModuleList([]) - ic = y_conv_channels - for oc, df in zip(dblock_out_channels, reversed(upsample_factors)): + ic = config.model_params.y_conv_channels + for oc, df in zip(config.model_params.dblock_out_channels, reversed(config.model_params.upsample_factors)): self.dblocks.append(DBlock(ic, oc, df)) ic = oc # film self.film = nn.ModuleList([]) - ic = y_conv_channels - for oc in reversed(ublock_out_channels): + ic = config.model_params.y_conv_channels + for oc in reversed(config.model_params.ublock_out_channels): self.film.append(FiLM(ic, oc)) ic = oc - # ublocks + # ublocksn self.ublocks = nn.ModuleList([]) - ic = x_conv_channels - for oc, uf, ud in zip(ublock_out_channels, upsample_factors, upsample_dilations): + ic = config.model_params.x_conv_channels + for oc, uf, ud in zip( + config.model_params.ublock_out_channels, + config.model_params.upsample_factors, + config.model_params.upsample_dilations, + ): self.ublocks.append(UBlock(ic, oc, uf, ud)) ic = oc - self.x_conv = Conv1d(in_channels, x_conv_channels, 3, padding=1) - 
self.out_conv = Conv1d(oc, out_channels, 3, padding=1) + self.x_conv = Conv1d(config.model_params.in_channels, config.model_params.x_conv_channels, 3, padding=1) + self.out_conv = Conv1d(oc, config.model_params.out_channels, 3, padding=1) - if use_weight_norm: + if config.model_params.use_weight_norm: self.apply_weight_norm() def forward(self, x, spectrogram, noise_scale): @@ -180,7 +220,7 @@ class Wavegrad(nn.Module): if eval: self.eval() assert not self.training - if self.use_weight_norm: + if self.config.model_params.use_weight_norm: self.remove_weight_norm() betas = np.linspace( config["test_noise_schedule"]["min_val"], @@ -195,3 +235,93 @@ class Wavegrad(nn.Module): config["train_noise_schedule"]["num_steps"], ) self.compute_noise_level(betas) + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + # format data + x = batch["input"] + y = batch["waveform"] + + # set noise scale + noise, x_noisy, noise_scale = self.compute_y_n(y) + + # forward pass + noise_hat = self.forward(x_noisy, x, noise_scale) + + # compute losses + loss = criterion(noise, noise_hat) + return {"model_output": noise_hat}, {"loss": loss} + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + @torch.no_grad() + def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + def test_run(self, ap: AudioProcessor, samples: List[Dict], ouputs: Dict): # pylint: disable=unused-argument + # setup noise schedule and inference + noise_schedule = self.config["test_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) + for sample in samples: + x = sample["input"] + y = sample["waveform"] + # compute voice + y_pred = self.inference(x) + # compute spectrograms + figures = plot_results(y_pred, y, ap, "test") + # Sample audio + sample_voice = y_pred[0].squeeze(0).detach().cpu().numpy() + return figures, {"test/audio": sample_voice} + + def get_optimizer(self): + return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) + + def get_scheduler(self, optimizer): + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, optimizer) + + def get_criterion(self): + return torch.nn.L1Loss() + + @staticmethod + def format_batch(batch: Dict) -> Dict: + # return a whole audio segment + m, y = batch[0], batch[1] + y = y.unsqueeze(1) + return {"input": m, "waveform": y} + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: True, data_items: List, verbose: bool, num_gpus: int + ): + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=self.config.seq_len, + hop_len=ap.hop_length, + pad_short=self.config.pad_short, + conv_pad=self.config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.batch_size, + shuffle=num_gpus <= 1, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_eval_loader_workers if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def on_epoch_start(self, trainer): # pylint: disable=unused-argument + 
noise_schedule = self.config["train_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 04040931..a5d89d5a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,13 +1,21 @@ import sys import time +from dataclasses import dataclass, field +from typing import Dict, List, Tuple import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from coqpit import Coqpit +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler -# fix this -from TTS.utils.audio import AudioProcessor as ap +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.layers.losses import WaveRNNLoss +from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian @@ -135,89 +143,145 @@ class Upsample(nn.Module): return m.transpose(1, 2), aux -class WaveRNN(nn.Module): - def __init__( - self, - rnn_dims, - fc_dims, - mode, - mulaw, - pad, - use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - num_res_blocks, - hop_length, - sample_rate, - ): +@dataclass +class WavernnArgs(Coqpit): + """🐸 WaveRNN model arguments. + + rnn_dims (int): + Number of hidden channels in RNN layers. Defaults to 512. + fc_dims (int): + Number of hidden channels in fully-conntected layers. Defaults to 512. + compute_dims (int): + Number of hidden channels in the feature ResNet. Defaults to 128. + res_out_dim (int): + Number of hidden channels in the feature ResNet output. Defaults to 128. + num_res_blocks (int): + Number of residual blocks in the ResNet. Defaults to 10. + use_aux_net (bool): + enable/disable the feature ResNet. Defaults to True. + use_upsample_net (bool): + enable/ disable the upsampling networl. If False, basic upsampling is used. Defaults to True. + upsample_factors (list): + Upsampling factors. The multiply of the values must match the `hop_length`. Defaults to ```[4, 8, 8]```. + mode (str): + Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single + Gaussian Distribution and `bits` for quantized bits as the model's output. + mulaw (bool): + enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults + to `True`. + pad (int): + Padding applied to the input feature frames against the convolution layers of the feature network. + Defaults to 2. + """ + + rnn_dims: int = 512 + fc_dims: int = 512 + compute_dims: int = 128 + res_out_dims: int = 128 + num_res_blocks: int = 10 + use_aux_net: bool = True + use_upsample_net: bool = True + upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + mode: str = "mold" # mold [string], gauss [string], bits [int] + mulaw: bool = True # apply mulaw if mode is bits + pad: int = 2 + feat_dims: int = 80 + + +class Wavernn(BaseVocoder): + def __init__(self, config: Coqpit): + """🐸 WaveRNN model. 
+ Original paper - https://arxiv.org/abs/1802.08435 + Official implementation - https://github.com/fatchord/WaveRNN + + Args: + config (Coqpit): [description] + + Raises: + RuntimeError: [description] + + Examples: + >>> from TTS.vocoder.configs import WavernnConfig + >>> config = WavernnConfig() + >>> model = Wavernn(config) + + Paper Abstract: + Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to + both estimating the data distribution and generating high-quality samples. Efficient sampling for this + class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we + describe a set of general techniques for reducing sampling time while maintaining high output quality. + We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that + matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it + possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight + pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of + parameters, large sparse networks perform better than small dense networks and this relationship holds for + sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample + high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on + subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple + samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an + orthogonal method for increasing sampling efficiency. + """ super().__init__() - self.mode = mode - self.mulaw = mulaw - self.pad = pad - self.use_upsample_net = use_upsample_net - self.use_aux_net = use_aux_net - if isinstance(self.mode, int): - self.n_classes = 2 ** self.mode - elif self.mode == "mold": + + self.args = config.model_params + self.config = config + + if isinstance(self.args.mode, int): + self.n_classes = 2 ** self.args.mode + elif self.args.mode == "mold": self.n_classes = 3 * 10 - elif self.mode == "gauss": + elif self.args.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) - self.rnn_dims = rnn_dims - self.aux_dims = res_out_dims // 4 - self.hop_length = hop_length - self.sample_rate = sample_rate + self.aux_dims = self.args.res_out_dims // 4 - if self.use_upsample_net: + if self.args.use_upsample_net: assert ( - np.cumproduct(upsample_factors)[-1] == self.hop_length + np.cumproduct(self.args.upsample_factors)[-1] == config.audio.hop_length ), " [!] 
upsample scales needs to be equal to hop_length" self.upsample = UpsampleNetwork( - feat_dims, - upsample_factors, - compute_dims, - num_res_blocks, - res_out_dims, - pad, - use_aux_net, + self.args.feat_dims, + self.args.upsample_factors, + self.args.compute_dims, + self.args.num_res_blocks, + self.args.res_out_dims, + self.args.pad, + self.args.use_aux_net, ) else: self.upsample = Upsample( - hop_length, - pad, - num_res_blocks, - feat_dims, - compute_dims, - res_out_dims, - use_aux_net, + config.audio.hop_length, + self.args.pad, + self.args.num_res_blocks, + self.args.feat_dims, + self.args.compute_dims, + self.args.res_out_dims, + self.args.use_aux_net, ) - if self.use_aux_net: - self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + if self.args.use_aux_net: + self.I = nn.Linear(self.args.feat_dims + self.aux_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims + self.aux_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims + self.aux_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims + self.aux_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) else: - self.I = nn.Linear(feat_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + self.I = nn.Linear(self.args.feat_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) - h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) mels, aux = self.upsample(mels) - if self.use_aux_net: + if self.args.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] a1 = aux[:, :, aux_idx[0] : aux_idx[1]] a2 = aux[:, :, aux_idx[1] : aux_idx[2]] @@ -226,7 +290,7 @@ class WaveRNN(nn.Module): x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) - if self.use_aux_net + if self.args.use_aux_net else torch.cat([x.unsqueeze(-1), mels], dim=2) ) x = self.I(x) @@ -236,15 +300,15 @@ class WaveRNN(nn.Module): x = x + res res = x - x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + x = torch.cat([x, a2], dim=2) if self.args.use_aux_net else x self.rnn2.flatten_parameters() x, _ = self.rnn2(x, h2) x = x + res - x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = torch.cat([x, a3], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = torch.cat([x, a4], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) return self.fc3(x) @@ -262,9 +326,9 @@ 
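The assertion completed above is worth keeping in mind when writing a new config: the product of the vocoder's upsample factors has to equal the audio hop length, because each mel frame must be expanded into exactly one hop of waveform samples. A minimal stand-alone check, using the WavernnArgs default factors and a hypothetical hop_length of 256 (not taken from any particular recipe):

import numpy as np

upsample_factors = [4, 8, 8]   # WavernnArgs default
hop_length = 256               # hypothetical audio.hop_length

# equivalent to the model's np.cumproduct(upsample_factors)[-1] == hop_length check
assert np.prod(upsample_factors) == hop_length, " [!] upsample scales need to equal hop_length"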
class WaveRNN(nn.Module): if mels.ndim == 2: mels = mels.unsqueeze(0) - wave_len = (mels.size(-1) - 1) * self.hop_length + wave_len = (mels.size(-1) - 1) * self.config.audio.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.args.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -274,11 +338,11 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).type_as(mels) - h2 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + h1 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) + h2 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) x = torch.zeros(b_size, 1).type_as(mels) - if self.use_aux_net: + if self.args.use_aux_net: d = self.aux_dims aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] @@ -286,35 +350,35 @@ class WaveRNN(nn.Module): m_t = mels[:, i, :] - if self.use_aux_net: + if self.args.use_aux_net: a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) - x = torch.cat([x, m_t, a1_t], dim=1) if self.use_aux_net else torch.cat([x, m_t], dim=1) + x = torch.cat([x, m_t, a1_t], dim=1) if self.args.use_aux_net else torch.cat([x, m_t], dim=1) x = self.I(x) h1 = rnn1(x, h1) x = x + h1 - inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + inp = torch.cat([x, a2_t], dim=1) if self.args.use_aux_net else x h2 = rnn2(inp, h2) x = x + h2 - x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a3_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a4_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) logits = self.fc3(x) - if self.mode == "mold": + if self.args.mode == "mold": sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) x = sample.transpose(0, 1).type_as(mels) - elif self.mode == "gauss": + elif self.args.mode == "gauss": sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) x = sample.transpose(0, 1).type_as(mels) - elif isinstance(self.mode, int): + elif isinstance(self.args.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -322,7 +386,7 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -337,22 +401,22 @@ class WaveRNN(nn.Module): else: output = output[0] - if self.mulaw and isinstance(self.mode, int): - output = ap.mulaw_decode(output, self.mode) + if self.args.mulaw and isinstance(self.args.mode, int): + output = AudioProcessor.mulaw_decode(output, self.args.mode) # Fade-out at the end to avoid signal cutting out suddenly - fade_out = np.linspace(1, 0, 20 * self.hop_length) + fade_out = np.linspace(1, 0, 20 * self.config.audio.hop_length) output = output[:wave_len] if wave_len > len(fade_out): - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.config.audio.hop_length :] *= fade_out self.train() return output def gen_display(self, i, seq_len, b_size, start): gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 - realtime_ratio = gen_rate * 1000 / self.sample_rate + realtime_ratio = gen_rate * 1000 / self.config.audio.sample_rate stream( "%i/%i -- 
batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), @@ -486,3 +550,83 @@ class WaveRNN(nn.Module): if eval: self.eval() assert not self.training + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + mels = batch["input"] + waveform = batch["waveform"] + waveform_coarse = batch["waveform_coarse"] + + y_hat = self.forward(waveform, mels) + if isinstance(self.args.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + waveform_coarse = waveform_coarse.float() + waveform_coarse = waveform_coarse.unsqueeze(-1) + # compute losses + loss_dict = criterion(y_hat, waveform_coarse) + return {"model_output": y_hat}, loss_dict + + def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + @torch.no_grad() + def test_run( + self, ap: AudioProcessor, samples: List[Dict], output: Dict # pylint: disable=unused-argument + ) -> Tuple[Dict, Dict]: + figures = {} + audios = {} + for idx, sample in enumerate(samples): + x = sample["input"] + y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples) + x_hat = ap.melspectrogram(y_hat) + figures.update( + { + f"test_{idx}/ground_truth": plot_spectrogram(x.T), + f"test_{idx}/prediction": plot_spectrogram(x_hat.T), + } + ) + audios.update({f"test_{idx}/audio", y_hat}) + return figures, audios + + @staticmethod + def format_batch(batch: Dict) -> Dict: + waveform = batch[0] + mels = batch[1] + waveform_coarse = batch[2] + return {"input": mels, "waveform": waveform, "waveform_coarse": waveform_coarse} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + collate_fn=dataset.collate, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=True, + ) + return loader + + def get_criterion(self): + # define train functions + return WaveRNNLoss(self.args.mode) From 59abf490a1dae294560030e0fb3c7bf85ba01028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:38:01 +0200 Subject: [PATCH 089/258] Implement `setup_model` for vocoder models --- TTS/vocoder/models/__init__.py | 147 +++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index e69de29b..cbd3950b 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -0,0 +1,147 @@ +import importlib +import re + +from coqpit import Coqpit + + +def to_camel(text): + text = text.capitalize() + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + + +def setup_model(config: Coqpit): + """Load models directly from configuration.""" + print(" > Vocoder Model: {}".format(config.model)) + if "discriminator_model" in config and "generator_model" in config: + MyModel = importlib.import_module("TTS.vocoder.models.gan") + MyModel = 
getattr(MyModel, "GAN") + else: + MyModel = importlib.import_module("TTS.vocoder.models." + config.model.lower()) + if config.model.lower() == "wavernn": + MyModel = getattr(MyModel, "Wavernn") + elif config.model.lower() == "gan": + MyModel = getattr(MyModel, "GAN") + elif config.model.lower() == "wavegrad": + MyModel = getattr(MyModel, "Wavegrad") + else: + MyModel = getattr(MyModel, to_camel(config.model)) + raise ValueError(f"Model {config.model} not exist!") + model = MyModel(config) + return model + + +def setup_generator(c): + print(" > Generator Model: {}".format(c.generator_model)) + MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.generator_model)) + # this is to preserve the Wavernn class name (instead of Wavernn) + if c.generator_model.lower() in "hifigan_generator": + model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + elif c.generator_model.lower() in "melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model in "melgan_fb_generator": + raise ValueError("melgan_fb_generator is now fullband_melgan_generator") + elif c.generator_model.lower() in "multiband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=4, + proj_kernel=7, + base_channels=384, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "fullband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "parallel_wavegan_generator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=c.audio["num_mels"], + dropout=0.0, + bias=True, + use_weight_norm=True, + upsample_factors=c.generator_model_params["upsample_factors"], + ) + else: + raise NotImplementedError(f"Model {c.generator_model} not implemented!") + return model + + +def setup_discriminator(c): + print(" > Discriminator Model: {}".format(c.discriminator_model)) + if "parallel_wavegan" in c.discriminator_model: + MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") + else: + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) + if c.discriminator_model in "hifigan_discriminator": + model = MyModel() + if c.discriminator_model in "random_window_discriminator": + model = MyModel( + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], + cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], + cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_sizes=(5, 3), + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0.0, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + ) + if c.discriminator_model == "parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + ) + return model From aed919cf1c895c51bdd10fc02a1388e909286e0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:42:36 +0200 Subject: [PATCH 090/258] Update `vocoder` datasets and `setup_dataset` --- TTS/vocoder/datasets/__init__.py | 57 ++++++++++++++++++++++++ TTS/vocoder/datasets/preprocess.py | 17 ++++++- TTS/vocoder/datasets/wavegrad_dataset.py | 2 +- TTS/vocoder/datasets/wavernn_dataset.py | 27 ++++++----- 4 files changed, 89 insertions(+), 14 deletions(-) diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index e69de29b..86b059c3 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -0,0 +1,57 @@ +from typing import List + +from coqpit import Coqpit +from torch.utils.data import Dataset + +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset + + +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: + if config.model.lower() in "gan": + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + elif config.model.lower() == "wavegrad": + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + 
hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + elif config.model.lower() == "wavernn": + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + else: + raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.") + return dataset diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index d99ee147..c4569b3d 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -3,10 +3,21 @@ import os from pathlib import Path import numpy as np +from coqpit import Coqpit from tqdm import tqdm +from TTS.utils.audio import AudioProcessor -def preprocess_wav_files(out_path, config, ap): + +def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): + """Process wav and compute mel and quantized wave signal. + It is mainly used by WaveRNN dataloader. + + Args: + out_path (str): Parent folder path to save the files. + config (Coqpit): Model config. + ap (AudioProcessor): Audio processor. + """ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) wav_files = find_wav_files(config.data_path) @@ -18,7 +29,9 @@ def preprocess_wav_files(out_path, config, ap): mel = ap.melspectrogram(y) np.save(mel_path, mel) if isinstance(config.mode, int): - quant = ap.mulaw_encode(y, qc=config.mode) if config.mulaw else ap.quantize(y, bits=config.mode) + quant = ( + ap.mulaw_encode(y, qc=config.mode) if config.model_params.mulaw else ap.quantize(y, bits=config.mode) + ) np.save(quant_path, quant) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index c0d24e84..d99fc417 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -136,4 +136,4 @@ class WaveGradDataset(Dataset): mels[idx, :, : mel.shape[1]] = mel audios[idx, : audio.shape[0]] = audio - return mels, audios + return audios, mels diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1596ea8f..d648b68c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -10,16 +10,7 @@ class WaveRNNDataset(Dataset): """ def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - mulaw, - is_training=True, - verbose=False, + self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True ): super().__init__() @@ -34,6 +25,7 @@ class WaveRNNDataset(Dataset): self.mulaw = mulaw self.is_training = is_training self.verbose = verbose + self.return_segments = return_segments assert self.seq_len % self.hop_len == 0 @@ -44,6 +36,16 @@ class WaveRNNDataset(Dataset): item = self.load_item(index) return item + def load_test_samples(self, num_samples): + samples = [] + return_segments = self.return_segments + self.return_segments = False + for idx in range(num_samples): + mel, audio, _ = self.load_item(idx) + samples.append([mel, audio]) + self.return_segments = return_segments + return samples + def load_item(self, index): """ load (audio, feat) couple if feature_path is set @@ -53,7 +55,10 @@ class WaveRNNDataset(Dataset): 
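The new `setup_dataset` factory shown earlier in this patch resolves the right dataset class from `config.model`, so recipe scripts no longer build `GANDataset`, `WaveGradDataset` or `WaveRNNDataset` by hand. A rough usage sketch; the wav path is a placeholder and `load_wav_data` is assumed to return the (eval, train) split the way the existing vocoder recipes use it:

from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.datasets import setup_dataset
from TTS.vocoder.datasets.preprocess import load_wav_data

config = WavernnConfig()
ap = AudioProcessor(**config.audio)
# placeholder data path; eval_split_size is the number of held-out files
eval_items, train_items = load_wav_data("/data/LJSpeech-1.1/wavs", config.eval_split_size)
train_dataset = setup_dataset(config, ap, is_eval=False, data_items=train_items, verbose=True)
eval_dataset = setup_dataset(config, ap, is_eval=True, data_items=eval_items, verbose=False)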
wavpath = self.item_list[index] audio = self.ap.load_wav(wavpath) - min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + if self.return_segments: + min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + else: + min_audio_len = audio.shape[0] + (2 * self.pad * self.hop_len) if audio.shape[0] < min_audio_len: print(" [!] Instance is too short! : {}".format(wavpath)) audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len]) From f3ff5b19712eb88d09a03d0a896e8f9976e770f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:44:02 +0200 Subject: [PATCH 091/258] Update `TTS.bin` scripts for the new API --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/compute_statistics.py | 2 +- TTS/bin/convert_tacotron2_torch_to_tf.py | 6 +++--- TTS/bin/distribute.py | 21 +++++---------------- TTS/bin/extract_tts_spectrograms.py | 16 ++++------------ 5 files changed, 14 insertions(+), 33 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index eb708040..35721f59 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -75,7 +75,7 @@ Example run: # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker - model = setup_model(num_chars, num_speakers=0, c=C) + model = setup_model(C) model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda) model.eval() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 25e3fce5..6179dafc 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -77,7 +77,7 @@ def main(): print(f" > Avg mel spec mean: {mel_mean.mean()}") print(f" > Avg mel spec scale: {mel_scale.mean()}") print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f" > Avg linear spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling CONFIG.audio.stats_path = output_file_path diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index 119529ae..a6fb5d9b 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -31,18 +31,18 @@ c = load_config(config_path) num_speakers = 0 # init torch model -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c) +model = setup_model(c) checkpoint = torch.load(args.torch_model_path, map_location=torch.device("cpu")) state_dict = checkpoint["model"] model.load_state_dict(state_dict) # init tf model +num_chars = len(phonemes) if c.use_phonemes else len(symbols) model_tf = Tacotron2( num_chars=num_chars, num_speakers=num_speakers, r=model.decoder.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 20d4bb20..873ddb1f 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -1,36 +1,24 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import argparse import os import pathlib import subprocess -import sys import time import torch +from TTS.trainer import TrainingArgs + def main(): """ Call train.py as a new process and pass command arguments """ - parser = argparse.ArgumentParser() + parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target 
training script to distibute.") - parser.add_argument( - "--continue_path", - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in sys.argv - ) args, unargs = parser.parse_known_args() + breakpoint() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") @@ -51,6 +39,7 @@ def main(): my_env = os.environ.copy() my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) + # prevent stdout for processes with rank != 0 stdout = None if i == 0 else open(os.devnull, "w") p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index c5ba1b2a..11cdfe31 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -14,7 +14,6 @@ from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model from TTS.tts.utils.speakers import get_speaker_manager -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -40,9 +39,7 @@ def setup_loader(ap, r, verbose=False): use_noise_augment=False, verbose=verbose, speaker_id_mapping=speaker_manager.speaker_ids, - d_vector_mapping=speaker_manager.d_vectors - if c.use_speaker_embedding and c.use_external_speaker_embedding_file - else None, + d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -224,16 +221,10 @@ def extract_spectrograms( def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data, symbols, phonemes, model_characters, speaker_manager + global meta_data, speaker_manager # Audio processor ap = AudioProcessor(**c.audio) - if "characters" in c.keys() and c["characters"]: - symbols, phonemes = make_symbols(**c.characters) - - # set model characters - model_characters = phonemes if c.use_phonemes else symbols - num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -245,7 +236,7 @@ def main(args): # pylint: disable=redefined-outer-name speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim) + model = setup_model(c) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") @@ -283,4 +274,5 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) + c.audio.trim_silence = False main(args) From b4bb567e042edfeae01742220b36df9363863ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:45:52 +0200 Subject: [PATCH 092/258] Update `vocoder` utils --- TTS/vocoder/layers/losses.py | 25 ++++- TTS/vocoder/utils/generic_utils.py | 166 +---------------------------- 2 files changed, 25 insertions(+), 166 
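The `TTS.bin` script updates above all converge on the same pattern: `setup_model` now takes only the config and resolves the model class and its sizes from it. A condensed sketch of that flow; the paths are placeholders and the checkpoint is assumed to use the usual {"model": state_dict} layout seen in these scripts:

import torch

from TTS.config import load_config
from TTS.tts.models import setup_model

config = load_config("/path/to/config.json")                             # placeholder path
model = setup_model(config)                                              # model class picked from config.model
checkpoint = torch.load("/path/to/checkpoint.pth", map_location="cpu")   # placeholder path
model.load_state_dict(checkpoint["model"])
model.eval()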
deletions(-) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 18076d85..9acdeea1 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,8 +1,12 @@ +from typing import Dict, Union + import librosa import torch from torch import nn from torch.nn import functional as F +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss + class TorchSTFT(nn.Module): # pylint: disable=abstract-method """TODO: Merge this with audio.py""" @@ -374,7 +378,7 @@ class GeneratorLoss(nn.Module): feat_match_loss = self.feat_match_loss(feats_fake, feats_real) return_dict["G_feat_match_loss"] = feat_match_loss adv_loss = adv_loss + self.feat_match_loss_weight * feat_match_loss - return_dict["G_loss"] = gen_loss + adv_loss + return_dict["loss"] = gen_loss + adv_loss return_dict["G_gen_loss"] = gen_loss return_dict["G_adv_loss"] = adv_loss return return_dict @@ -419,5 +423,22 @@ class DiscriminatorLoss(nn.Module): return_dict["D_hinge_gan_fake_loss"] = hinge_D_fake_loss loss += hinge_D_loss - return_dict["D_loss"] = loss + return_dict["loss"] = loss return return_dict + + +class WaveRNNLoss(nn.Module): + def __init__(self, wave_rnn_mode: Union[str, int]): + super().__init__() + if wave_rnn_mode == "mold": + self.loss_func = discretized_mix_logistic_loss + elif wave_rnn_mode == "gauss": + self.loss_func = gaussian_loss + elif isinstance(wave_rnn_mode, int): + self.loss_func = torch.nn.CrossEntropyLoss() + else: + raise ValueError(" [!] Unknown mode for Wavernn.") + + def forward(self, y_hat, y) -> Dict: + loss = self.loss_func(y_hat, y) + return {"loss": loss} diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index cb45feb0..eeabbea5 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,6 +1,3 @@ -import importlib -import re - import numpy as np import torch from matplotlib import pyplot as plt @@ -29,7 +26,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat, y, ap, global_step, name_prefix): +def plot_results(y_hat, y, ap, name_prefix): """Plot vocoder model results""" # select an instance from batch @@ -47,7 +44,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): plt.title("groundtruth speech") plt.subplot(2, 1, 2) plt.plot(y_hat) - plt.title(f"generated speech @ {global_step} steps") + plt.title("generated speech") plt.tight_layout() plt.close() @@ -58,162 +55,3 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): name_prefix + "speech_comparison": fig_wave, } return figures - - -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) - # this is to preserve the WaveRNN class name (instead of Wavernn) - if c.generator_model.lower() == "wavernn": - MyModel = getattr(MyModel, "WaveRNN") - else: - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model.lower() in "wavernn": - model = MyModel( - rnn_dims=c.wavernn_model_params["rnn_dims"], - fc_dims=c.wavernn_model_params["fc_dims"], - mode=c.mode, - mulaw=c.mulaw, - pad=c.padding, - use_aux_net=c.wavernn_model_params["use_aux_net"], - use_upsample_net=c.wavernn_model_params["use_upsample_net"], - upsample_factors=c.wavernn_model_params["upsample_factors"], - feat_dims=c.audio["num_mels"], - compute_dims=c.wavernn_model_params["compute_dims"], - res_out_dims=c.wavernn_model_params["res_out_dims"], - num_res_blocks=c.wavernn_model_params["num_res_blocks"], - hop_length=c.audio["hop_length"], - sample_rate=c.audio["sample_rate"], - ) - elif c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) - elif c.generator_model.lower() in "melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model in "melgan_fb_generator": - raise ValueError("melgan_fb_generator is now fullband_melgan_generator") - elif c.generator_model.lower() in "multiband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "fullband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "parallel_wavegan_generator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=c.audio["num_mels"], - dropout=0.0, - bias=True, - use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) - elif c.generator_model.lower() in "wavegrad": - model = MyModel( - in_channels=c["audio"]["num_mels"], - out_channels=1, - use_weight_norm=c["model_params"]["use_weight_norm"], - x_conv_channels=c["model_params"]["x_conv_channels"], - y_conv_channels=c["model_params"]["y_conv_channels"], - dblock_out_channels=c["model_params"]["dblock_out_channels"], - ublock_out_channels=c["model_params"]["ublock_out_channels"], - upsample_factors=c["model_params"]["upsample_factors"], - upsample_dilations=c["model_params"]["upsample_dilations"], - ) - else: - raise NotImplementedError(f"Model {c.generator_model} not implemented!") - return model - - -def setup_discriminator(c): - print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: - MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") - else: - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "hifigan_discriminator": - model = MyModel() - if c.discriminator_model in "random_window_discriminator": - model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], - cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], - cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - ) - if c.discriminator_model == "parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, - ) - return model - - -# def check_config(c): -# c = None -# pass From 8182f5168fda52069305749351bc24c0002ff91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:26:11 +0200 Subject: [PATCH 093/258] Fixup `utils` for the trainer --- TTS/utils/generic_utils.py | 14 ++++++-------- TTS/utils/logging/tensorboard_logger.py | 2 ++ TTS/utils/manage.py | 2 +- TTS/utils/radam.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 67cd0bf5..e7c57529 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -16,9 +16,10 @@ import torch def to_cuda(x: torch.Tensor) -> torch.Tensor: if x is None: return None - x = x.contiguous() - if torch.cuda.is_available(): - x = x.cuda(non_blocking=True) + if torch.is_tensor(x): + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) return x @@ -57,13 +58,10 @@ def get_commit_hash(): return commit -def create_experiment_folder(root_path, model_name, debug): +def create_experiment_folder(root_path, model_name): """Create a folder with the current date and time""" date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - if debug: - commit_hash = "debug" - else: - commit_hash = get_commit_hash() + commit_hash = get_commit_hash() output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) os.makedirs(output_folder, exist_ok=True) print(" > Experiment folder: {}".format(output_folder)) diff --git a/TTS/utils/logging/tensorboard_logger.py b/TTS/utils/logging/tensorboard_logger.py index 657deb5b..3d7ea1e6 100644 --- a/TTS/utils/logging/tensorboard_logger.py +++ b/TTS/utils/logging/tensorboard_logger.py @@ -34,6 +34,8 @@ class 
TensorboardLogger(object): def dict_to_tb_audios(self, scope_name, audios, step, sample_rate): for key, value in audios.items(): + if value.dtype == "float16": + value = value.astype("float32") try: self.writer.add_audio("{}/{}".format(scope_name, key), value, step, sample_rate=sample_rate) except RuntimeError: diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index cf7df7de..86734c9f 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -137,7 +137,7 @@ class ModelManager(object): # set scale stats path in config.json config_path = output_config_path config = load_config(config_path) - config.external_speaker_embedding_file = output_speakers_path + config.d_vector_file = output_speakers_path config.save_json(config_path) return output_model_path, output_config_path, model_item diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index b6c86fed..73426e64 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -1,4 +1,4 @@ -# from https://github.com/LiyuanLucasLiu/RAdam +# modified from https://github.com/LiyuanLucasLiu/RAdam import math From 0e01c2594f2f75feb35a908f0f48f97e220ba94f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:26:41 +0200 Subject: [PATCH 094/258] Update `speaker_manager` --- TTS/tts/utils/speakers.py | 167 ++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 79 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 01e26c6b..5caa2fee 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -5,91 +5,13 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np import torch +from coqpit import Coqpit from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def _set_file_path(path): - """Find the speakers.json under the given path or the above it. - Intended to band aid the different paths returned in restored and continued training.""" - path_restore = os.path.join(os.path.dirname(path), "speakers.json") - path_continue = os.path.join(path, "speakers.json") - if os.path.exists(path_restore): - return path_restore - if os.path.exists(path_continue): - return path_continue - raise FileNotFoundError(f" [!] `speakers.json` not found in {path}") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - if os.path.splitext(out_path)[1] == ".json": - json_file = out_path - else: - json_file = _set_file_path(out_path) - with open(json_file) as f: - return json.load(f) - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - if out_path is not None: - speakers_json_path = _set_file_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): - """Inititalize and return a `SpeakerManager` based on config values""" - speaker_manager = SpeakerManager() - if c.use_speaker_embedding: - speaker_manager.set_speaker_ids_from_data(meta_data_train) - if restore_path: - speakers_file = _set_file_path(restore_path) - # restoring speaker manager from a previous run. 
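One detail worth calling out from the trainer fix-ups above: `to_cuda` now only touches actual tensors, so formatted batches can carry non-tensor metadata (file names, raw text) without tripping the `.contiguous()` call. A stand-alone illustration; the helper body mirrors the patched function and the example batch is made up:

import torch

def to_cuda(x):
    # only tensors are made contiguous and moved to the GPU; everything else passes through
    if x is None:
        return None
    if torch.is_tensor(x):
        x = x.contiguous()
        if torch.cuda.is_available():
            x = x.cuda(non_blocking=True)
    return x

batch = {"input": torch.zeros(2, 80, 50), "item_idx": ["a.wav", "b.wav"]}
batch = {k: to_cuda(v) for k, v in batch.items()}  # the string list survives unchanged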
- if c.use_external_speaker_embedding_file: - # restore speaker manager with the embedding file - if not os.path.exists(speakers_file): - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - if not os.path.exists(c.external_speaker_embedding_file): - raise RuntimeError( - "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" - ) - speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) - speaker_manager.set_d_vectors_from_file(speakers_file) - elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. - speaker_ids_from_data = speaker_manager.speaker_ids - speaker_manager.set_speaker_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." - elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: - # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file - ): # new speaker manager with speaker IDs file. - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - print( - " > Training with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) - ) - ) - # save file if path is defined - if out_path: - out_file_path = os.path.join(out_path, "speakers.json") - print(f" > Saving `speakers.json` to {out_file_path}.") - if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: - speaker_manager.save_d_vectors_to_file(out_file_path) - else: - speaker_manager.save_speaker_ids_to_file(out_file_path) - return speaker_manager - - class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. @@ -356,3 +278,90 @@ class SpeakerManager: def plot_embeddings(self): # TODO: implement speaker encoder raise NotImplementedError + + +def _set_file_path(path): + """Find the speakers.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "speakers.json") + path_continue = os.path.join(path, "speakers.json") + if os.path.exists(path_restore): + return path_restore + if os.path.exists(path_continue): + return path_continue + raise FileNotFoundError(f" [!] 
`speakers.json` not found in {path}") + + +def load_speaker_mapping(out_path): + """Loads speaker mapping if already present.""" + if os.path.splitext(out_path)[1] == ".json": + json_file = out_path + else: + json_file = _set_file_path(out_path) + with open(json_file) as f: + return json.load(f) + + +def save_speaker_mapping(out_path, speaker_mapping): + """Saves speaker mapping if not yet present.""" + if out_path is not None: + speakers_json_path = _set_file_path(out_path) + with open(speakers_json_path, "w") as f: + json.dump(speaker_mapping, f, indent=4) + + +def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: + """Create a SpeakerManager instance based on provided configuration. + + Args: + c (Coqpit): Model configuration. + restore_path (str): Path to a previous training folder. + data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding + layers is used. Defaults to None. + out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None. + + Returns: + SpeakerManager: + """ + speaker_manager = SpeakerManager() + if c.use_speaker_embedding: + if data is not None: + speaker_manager.set_speaker_ids_from_data(data) + if restore_path: + speakers_file = _set_file_path(restore_path) + # restoring speaker manager from a previous run. + if c.use_d_vector_file: + # restore speaker manager with the embedding file + if not os.path.exists(speakers_file): + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file") + if not os.path.exists(c.d_vector_file): + raise RuntimeError( + "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" + ) + speaker_manager.load_d_vectors_file(c.d_vector_file) + speaker_manager.set_d_vectors_from_file(speakers_file) + elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. + speaker_ids_from_data = speaker_manager.speaker_ids + speaker_manager.set_speaker_ids_from_file(speakers_file) + assert all( + speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + ), " [!] You cannot introduce new speakers to a pre-trained model." + elif c.use_d_vector_file and c.d_vector_file: + # new speaker manager with external speaker embeddings. + speaker_manager.set_d_vectors_from_file(c.d_vector_file) + elif c.use_d_vector_file and not c.d_vector_file: # new speaker manager with speaker IDs file. 
+ raise "use_d_vector_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + print( + " > Training with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) + ) + # save file if path is defined + if out_path: + out_file_path = os.path.join(out_path, "speakers.json") + print(f" > Saving `speakers.json` to {out_file_path}.") + if c.use_d_vector_file and c.d_vector_file: + speaker_manager.save_d_vectors_to_file(out_file_path) + else: + speaker_manager.save_speaker_ids_to_file(out_file_path) + return speaker_manager From c3a0bc702eda46ccf0210dcfa8dedf52283f69d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:27:28 +0200 Subject: [PATCH 095/258] fixup configs --- TTS/config/shared_configs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index a7976db7..801855c1 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument @dataclass @@ -214,7 +214,7 @@ class BaseTrainingConfig(Coqpit): to 10000. num_loader_workers (int): Number of workers for training time dataloader. - num_val_loader_workers (int): + num_eval_loader_workers (int): Number of workers for evaluation time dataloader. output_path (str): Path for training output folder. The nonexist part of the given path is created automatically. @@ -243,8 +243,8 @@ class BaseTrainingConfig(Coqpit): keep_all_best: bool = False keep_after: int = 10000 # dataloading - num_loader_workers: int = MISSING - num_val_loader_workers: int = 0 + num_loader_workers: int = None + num_eval_loader_workers: int = 0 use_noise_augment: bool = False # paths output_path: str = None From 7dc2177df4506f8cba5de9d24380367124891d7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:49:06 +0200 Subject: [PATCH 096/258] Update `synthesizer` for speaker and model init --- TTS/utils/synthesizer.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8f510f20..365ab8bd 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -6,7 +6,7 @@ import pysbd import torch from TTS.config import load_config -from TTS.tts.models import setup_model +from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import @@ -14,7 +14,8 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis, trim_silence from TTS.tts.utils.text import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor -from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input, setup_generator +from TTS.vocoder.models import setup_model as setup_vocoder_model +from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input class Synthesizer(object): @@ -98,7 +99,7 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - 
self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("d_vector_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers self.d_vector_dim = self.speaker_manager.d_vector_dim @@ -127,16 +128,11 @@ class Synthesizer(object): if self.tts_config.use_speaker_embedding is True: self.tts_speakers_file = ( - self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"] + self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"] ) - self._load_speakers(self.tts_speakers_file) + self.tts_config["d_vector_file"] = self.tts_speakers_file - self.tts_model = setup_model( - self.input_size, - num_speakers=self.num_speakers, - c=self.tts_config, - d_vector_dim=self.d_vector_dim, - ) + self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -151,7 +147,7 @@ class Synthesizer(object): """ self.vocoder_config = load_config(model_config) self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) - self.vocoder_model = setup_generator(self.vocoder_config) + self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() From 77d57dd301b915c75d90e4961548a2b7fdb06876 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:52:27 +0200 Subject: [PATCH 097/258] Print `max_decoder_steps` when model reaches the limit --- TTS/tts/layers/tacotron/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index 61fe9f4b..9c33623e 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -357,7 +357,7 @@ class Decoder(nn.Module): if stop_token > self.stop_threshold and t > inputs.shape[0] // 2: break if len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") + print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}") break memory = self._update_memory(decoder_output) From 220e184f66923abe8f5ff3fc184cc73e5c3ad333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:54:04 +0200 Subject: [PATCH 098/258] Apply small fixes for API compatibility --- TTS/tts/tf/utils/generic_utils.py | 2 +- notebooks/PlotUmapLibriTTS.ipynb | 4 +++- notebooks/TestAttention.ipynb | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py index e76893c2..91434a38 100644 --- a/TTS/tts/tf/utils/generic_utils.py +++ b/TTS/tts/tf/utils/generic_utils.py @@ -83,7 +83,7 @@ def setup_model(num_chars, num_speakers, c, enable_tflite=False): num_chars=num_chars, num_speakers=num_speakers, r=c.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index 97f9800d..0448f3df 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -33,7 +33,9 @@ "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", ======= - "from 
TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import load_config\n", >>>>>>> dev "\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index ed1c245b..5d8eed85 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -37,7 +37,9 @@ "import librosa.display\n", "\n", "from TTS.tts.layers import *\n", - "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.text import text_to_sequence\n", From 7de2756fc493271f52bebeaadaded9aaddc67af3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:57:05 +0200 Subject: [PATCH 099/258] =?UTF-8?q?Enable=20support=20for=20=F0=9F=90=8D?= =?UTF-8?q?=20python=203.10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump up versions numpy 1.19.5 and TF 2.5.0 --- TTS/tts/datasets/__init__.py | 4 ---- pyproject.toml | 2 +- requirements.tf.txt | 2 +- requirements.txt | 2 +- setup.py | 5 ++--- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index bcdbf6a6..cbae78a7 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -7,10 +7,6 @@ import numpy as np from TTS.tts.datasets.formatters import * from TTS.tts.datasets.TTSDataset import TTSDataset -#################### -# UTILITIES -#################### - def split_dataset(items): speakers = [item[-1] for item in items] diff --git a/pyproject.toml b/pyproject.toml index feaf5fd4..0941a906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy==1.18.5"] +requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"] [flake8] max-line-length=120 diff --git a/requirements.tf.txt b/requirements.tf.txt index 60f6e6c9..8e256a90 100644 --- a/requirements.tf.txt +++ b/requirements.tf.txt @@ -1 +1 @@ -tensorflow==2.3.1 +tensorflow==2.5.0 diff --git a/requirements.txt b/requirements.txt index fde48978..bc69481a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ inflect jieba librosa==0.8.0 matplotlib -numpy==1.18.5 +numpy==1.19.5 pandas pypinyin pysbd diff --git a/setup.py b/setup.py index 7cfb6519..b4015455 100644 --- a/setup.py +++ b/setup.py @@ -11,9 +11,8 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup - if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) + raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) cwd = os.path.dirname(os.path.abspath(__file__)) @@ -99,7 +98,7 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires=">=3.6.0, <3.9", + python_requires=">=3.6.0, <3.10", entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: Python", From d2d914cbc0c76b60d3802297129450936dfa327c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:01:39 +0200 Subject: [PATCH 100/258] Update Pylint 
configuration --- .pylintrc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index 34c121eb..7293f5ad 100644 --- a/.pylintrc +++ b/.pylintrc @@ -61,6 +61,9 @@ confidence= # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable=missing-docstring, + too-many-public-methods, + too-many-lines, + bare-except, line-too-long, fixme, wrong-import-order, From 787f6177c0d0612ddd761d2b1d98d3667a74058a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:02:02 +0200 Subject: [PATCH 101/258] Update gitignore --- .gitignore | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7ca905ff..c4647723 100644 --- a/.gitignore +++ b/.gitignore @@ -124,7 +124,9 @@ version.py # jupyter dummy files core +# files used internally fro dev, test etc. tests/outputs/* +tests/train_outputs/* TODO.txt .vscode/* data/* @@ -132,7 +134,21 @@ notebooks/data/* TTS/tts/layers/glow_tts/monotonic_align/core.c .vscode-upload.json temp_build/* -recipes/* - -# nohup logs +recipes/WIP/* +recipes/ljspeech/LJSpeech-1.1/* +events.out* +old_configs/* +model_importers/* +model_profiling/* +docs/* +.noseids +.dccache +log.txt +umap.png *.out +SocialMedia.txt +output.wav +tts_output.wav +deps.json +speakers.json +internal/* \ No newline at end of file From 9cb1062736e55a7238f74f53392e97df2b5b3e01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:09:40 +0200 Subject: [PATCH 102/258] Create LJSpeech recipes for all the models --- recipes/ljspeech/README.md | 19 ++++ recipes/ljspeech/download_ljspeech.sh | 14 +++ recipes/ljspeech/glow_tts/train_glowtts.py | 30 +++++++ recipes/ljspeech/hifigan/train_hifigan.py | 30 +++++++ .../train_multiband_melgan.py | 30 +++++++ recipes/ljspeech/tacotron2-DCA/run.sh | 22 +++++ .../ljspeech/tacotron2-DCA/scale_stats.npy | Bin 0 -> 10700 bytes .../ljspeech/tacotron2-DCA/tacotron2-DCA.json | 85 ++++++++++++++++++ .../ljspeech/tacotron2-DDC/scale_stats.npy | Bin 0 -> 10700 bytes recipes/ljspeech/wavegrad/train_wavegrad.py | 29 ++++++ recipes/ljspeech/wavernn/train_wavernn.py | 30 +++++++ 11 files changed, 289 insertions(+) create mode 100644 recipes/ljspeech/README.md create mode 100644 recipes/ljspeech/download_ljspeech.sh create mode 100644 recipes/ljspeech/glow_tts/train_glowtts.py create mode 100644 recipes/ljspeech/hifigan/train_hifigan.py create mode 100644 recipes/ljspeech/multiband_melgan/train_multiband_melgan.py create mode 100644 recipes/ljspeech/tacotron2-DCA/run.sh create mode 100644 recipes/ljspeech/tacotron2-DCA/scale_stats.npy create mode 100644 recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json create mode 100644 recipes/ljspeech/tacotron2-DDC/scale_stats.npy create mode 100644 recipes/ljspeech/wavegrad/train_wavegrad.py create mode 100644 recipes/ljspeech/wavernn/train_wavernn.py diff --git a/recipes/ljspeech/README.md b/recipes/ljspeech/README.md new file mode 100644 index 00000000..94508a7f --- /dev/null +++ b/recipes/ljspeech/README.md @@ -0,0 +1,19 @@ +# 🐸💬 TTS LJspeech Recipes + +For running the recipes + +1. Download the LJSpeech dataset here either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. +2. Go to your desired model folder and run the training. + + Running Python files. 
(Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) + ```terminal + CUDA_VISIBLE_DEVICES="0" python train_modelX.py + ``` + + Running bash scripts. + ```terminal + bash run.sh + ``` + +💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best +result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh new file mode 100644 index 00000000..14ef058d --- /dev/null +++ b/recipes/ljspeech/download_ljspeech.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 \ No newline at end of file diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py new file mode 100644 index 00000000..0a3c3838 --- /dev/null +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import GlowTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py new file mode 100644 index 00000000..99b39e99 --- /dev/null +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -0,0 +1,30 @@ +import os + +from TTS.vocoder.configs import HifiganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = HifiganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git 
a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py new file mode 100644 index 00000000..6b766ab7 --- /dev/null +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -0,0 +1,30 @@ +import os + +from TTS.vocoder.configs import MultibandMelganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = MultibandMelganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DCA/run.sh b/recipes/ljspeech/tacotron2-DCA/run.sh new file mode 100644 index 00000000..8bcd9e3d --- /dev/null +++ b/recipes/ljspeech/tacotron2-DCA/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# # download LJSpeech dataset +# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# # extract +# tar -xjf LJSpeech-1.1.tar.bz2 +# # create train-val splits +# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +# mv LJSpeech-1.1 $RUN_DIR/ +# rm LJSpeech-1.1.tar.bz2 +# # compute dataset mean and variance for normalization +# python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DCA.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path /media/erogol/nvme_linux/gdrive/Projects/TTS/recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DCA/scale_stats.npy b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy new file mode 100644 index 0000000000000000000000000000000000000000..1dc577a68253f87470e66444db4f19e583dc7adb GIT binary patch literal 10700 zcmbulc|4Te8$Ukwy^TUDyHb`UmB^)(lr<@9$ugF*jV+2oN<>MEM?yqNR7y&vbEl1z zrAUQrm6>5?gu!5DzUQgW^ZmYlfB)u>xnAcw*Ltq&oO|Ydp9^+-Y#m$$NYSLD%YDfa zKHC6ik0*XC;NnkL_~&1`9w-mNPndyLwx_vQUt6?Cnq@sbvmd9>Q8;?RxC&8%cWqNT%$moO|y z7=)6Bml+8Wi2*-eH69frh6P!@4pbpRs(VO9Kwp@6w*010vyU*5K5m@++(DSQy!hFz zI6q;+ShGd5?UNAECz&=&6bKRF??3EzJS;?b`NYK2b_fwNif*hY>x76WpG6{=QbNQL z+m?e9N5}G%zoDf=c0$K4HRgxm59+Zbeazg=O(KMl!npFX zaS_7XX}`v(uPAX}>*bnNYs84FEuH^Rb;JoN>6*A3qvC{!n&`3kcnPBOX>0G7YZ8Qq z$L3+7L`?=_XQqYSKw{EYdIY(xi*Eb@6h0hn?{a& zG8G19+Akicr9#n_M*da?70$HlHeczX!sV{)l%hx~oJxK_#!sO_sfv-#<_;=M-50)A z_<;(P!jmO&m@hkT-nn86Do9tnezPNl3O(o7Ui25F!FE|`%f!1>5Lofu)BZgb6wmc+ z{V+>~eM;5aUeKvv*#1VK=`ML8tD4L3lhp zFRM+S3Mr@EyjjLnC|OhOdDRK~?<ntd*=vI^ zxu=y>RXqrgB2rej@dhEK$a%HFVk#t0TPd&JM}@mCm&7DLQ^DY$qWc@HXmE*oj%6H5 zgW$ODvGjv9*gMmDSUHadQrB(Ex60AsmA_()_f2QuD-Zf#(|_P$%Ai60o?J(GH49h(8;OIqHo z6lVfOIg+B{?IMMnHYhNJ>=pj^cvk(dan=59Ty;l^25wwi0l||L&3`*rd>)B(cs4+7 zGE9Il<7(+%mnRXncPyn>Zl6KvJ5usSou^QSP{h=!eQabE_2Y^wX#}O28mPD#(9nV8 zfRFaN1E_xgdew_YUFew(cSY;v&uHasxBl1jKcnw`+@a@J+L3_Is_!I+&nV{INMlWM z2O5-=^>ppsTT#&puHoD(pAhm&7jW9rj@-YT`5YY9fwRoFj1qUDDS)#3oVro%l{b7LU9Mg z&V{tH5V`Wn8xuYYT@ZYHW!J$G)NowEoZyTg`={HARvr9}-p$c5Eq(MGUGa%}u~~i; z?btLLlRIw|l?TS|drKKbE-kLBzsZgv1;ZOA_npR2nTon{RoNK2{>w|Wk~@aNV#<3= zgvU{E_E_2+y>Vok=seeP-#8L|x!=+wbsXuwqg0v~kE1(Q6V3}V#?f_C*%Zg9an!RQ zfZt^>j3@wZsLpM!VtJ>O(q0QJv0!lz^*0*ak)CPH_>EK@xLP8Ozfqi+zuR2aZxqw`OGS4Mwl`sg z?yMa}y@q--Az`D)wlJO1*))pSa#z<_tr|m3nR!#sc8sBk*_xQ_%rPVmucZwK$B^yC z=ly4O#!Jjy6C4pX`lK7$+Goy2p}lXUbC0r7 zoWiFlp)@=eJ~Y>%ij69IZgk|dvC#o<(Ea|4jiR0g`LF)QMh`^v6FR%th;-RIVeLmY zI#s89prDS8((Qe>_r7AI{J!5O6)M>1xuxLwm5EY2Ao%WTyUo?2g?y5-D4k zK506RHkRB|wVWD5zcWZ$s$FA9-R^K^Z08ti@ywaxw2vX~s}{au(->0ij$5{>e+=md zpq^tLW2h)=NY(et7%J2znf>}bhIBo7Pn2-q)wdj^md+nX(9)~of!AB>m4|(-+c+|) z3f6u{9!I%m*|#@b7)NWHQ$*Dtk0VVJi$YeNT%a@s(^eSM_7_PCUNHnDLS~U?YL~=c_V| z*obm2y7$9cHfsNV`Q_LeHrlCNGmmP*M%QG6=WoUSUx{tFz0eBxpXNoHod+8^<>)V& zCgVPeJJkOG#~c0A%_kXCdH+tGO~p9A9Rd|5_t~gp_ZwzNJsWL)S|2CZ&qe{s-O^Pf zY_vCSH1Vy>1Tyt%DA}Yqfr`q7YH1rM(4Ht?e%oeDr~US>_MAY~=Ui`u`%R#|%(3b3 z2@~k%zL1hHrzX&@BG)3@s}m@!(o!ws#sqR&DDhC_A(qp+`t#1y3H0IpOqWK(1WI~g zWu(_TfjEw<49`&~P^jJs-x%%$dTQ0VXpzJuI(PVWl+3b8v~S0zX7{C&$T?7GZn^0s zvUo0cKH6dui55;RJK!;i99v&?%X&_ty9SbDZAT|jyuE+%jkrlvT6^cc%-Knl>b^6| z@8Tp{ym8sC=8{Pidf{sDy2?qE@Xqhm?R%4`M=SAs@|#K2`K{kXfS5#s%WlSLcTA$L zT_*pOHccW%+%7qXL98bM>s&uiqOqQ(Vh2Vhk?DKI<3A=Qk!$ZzN0RImDmy;Ym#RF4 zcw@3Y{iajs?W=t4lKoR?tS!vY{lpadC2OH}vU&Kvr~aQJPl8wc6;@AW)@|F#rf3TPau;UH&8In{_C9Q0C6WJ%VlX=J{=@t}m$G&0pQ zx=y|}jndaz27GIoMs|l??zhO#pscUQBF8OfP|Jps0!z|nP{#ad_fld8&HD8hT%Dak zgluQGkl8F+n)Hql6F7?^585ms-JV6~Jd10q8)uP{wUL0;94>NP=N44Env1rj-Ex`@ z;387U#i^bnT$Fl@?`l!NMHV}go0e2@kx1-WL(vW{dP2`=Upm4?1(VFTb@O;gcV(@# zkTwr(@Ki;k+j;1~JeyqZVIJzpI)6I1kcW1--|FAf#zRk|NJ-N(JXAkD?D$ogkD|Js zcRVrUqX)~T_9b}n(XF~2fiDm7k!!Av&9_)S%9%-`GfMpk9HvYE9_Ai9mp0eR1 zWtCyew_1FZNz!l2GROSFh1OQae8ju`Ooy|Ij|^{`JzIrowj${teMvs5&ddB%F2qN3 zd@eTNfSVA-%6L9a2_2 
zR4U`#)T+iq&5`N6VG$l$LyBC=?%^WIgntY|UU1Q{hi2pAVlGO{GJQKi=At`&Hrtb( zxJWcPCiT)vF0$S7X=j!I7cH&*DXR2)77@lXGpVh!=)x|8I*H0zo6PIVU1a*#y@nn5E+S?U(`GwAzAX5-ukGiYP|`@Y1pGpION zv_&{}26d*uxm#{Ch%+;)E4Y3J8D|*2{et^4RL*j>D`^I)7?*~McTb}VhDdko>uHo* z5$?aRbQ)bUc{TXy(ll~fv^3~&{50Bj>8dw3U>bcH&{N&!JdHAZ=3Csdm`0#}%g3(TUsLolOg;k+8uOFM7^2TJ`dpVKt9~ath+(uFY`J&7VvkJ0=Hl-K);% z406yYT9~u0hl4x?51%dSUc} z=R1~DUR0h{i05l9Z|TMiaL`zdamD9v9OO3oV#DRtWEThhqW#D^fcYP8g!^lJ z|LnZVLCp_srYZT@Py5@AE!Q|`EoYr@ zYcU52RK0+?7zSA$aAnV;XB-qiepxI5`@6m2dyO%U>!2LFaT|++WWFreSSvV zpe&w7)A?Ob%~nsNV{)CwH}Lr~H-1-x+o5T62IAK~%bG^~RU`6^mD9+hXw`<=o_(C` z7|lf|;p|9C5*MwteV2ar8isA$=06j9#6{|cbDm8$a1k;7)-8y@mH`Y7A1lZO;s7Qu_(JapZB!>(`%K3WsHy$Si+1;)UOFGU+2fpX=&Lr?rr|eO| z_2;qPUEOx$Qa<9QUNaqfz(*F}em@QWfbAMNm#Vb$k&~jdiV>BMtVVpcOpftU?5=yY z-XbJo`^tDZDQObXt#e~yEB;?sdo*IcrUr>HPf5ww)*%teZNaoUQxb7oJ|5V#B-f8bY!-SoZ_t-Si08n=ZcJtMtX6S7 zNJNQ}$s)D=BqCJ)!Ox{`BqIF!mRU7-5+TE@opWb5iAX3`PrZq0bxnfKVjNeL<(AP{ zOw;RwZJyxxd@q>!j9_ZuxcY_**0bq1o!;$9A_}c;CrkyCh*ZZfo2p|;gjp5m_<;-( zK_l1rMHgUuB6l`Etio~R7-_9Sg6Hq_<|6709p zc0NOb#4zoN{vr&+b*BjI{Coyer66s)pF+gSWe3!^Y!fD`@-OXoD;FlJc@`I%7K;!i zw#W0%WQq_vQ1i}TM3ku5>27P1EK108hV0vf#0bl^hV64w#0a?@wW-&0#RT_P+qaN=M47{{RG? z{{Vz^?OjEsi+{m>ZS4iyYI@;@?C^f7S~m!Mujpzx)(H*KXJ7e>euZ7)5=kp*kYtwK2f>aB##>e$^U>Z+Eh|SO;KEnR@O*$P+TKTGZ{xtZ~uA^s;p{}*) z)tTLfRQR11ws*HF6*i^waxUA@fl9WcnRd`&Nm*WRsmu^e8B5=Dn52W(!^;cn4ToUH zSK(&fE*j*IMbp^}Xi&aeV@XU} z`qjT{+G72U_AeDS(V*7W^V7gW8Wi0P?mX~=3OWZ23@?7B!Mj$e#w*`xa8O{w;yYhy zVC5t8P<$H=Dpa&}7iQAn#81`C4;?h<+R}gXLmdrTz1!JBeKhcN*n4VuB@I5!_8)U9 z$I#W1Bj>XWXmEG(>`2Q}8gPkxXR|Fduvs&F9({|=ce+-p@+i_*AFT;JXY=I((Dr%i*$-QBx>3DMxj*6A&|7-~DE$6VD` zMujw$lg_g_RM^lGo@XycgH@4_LQ2?FSTnWA;P)7Y>SPKYH)6Y5{k7ie<}|o^zM)DT zLuzNvC=~T$KY~7Q#lPctT)N1EXK@_s?LVpR4x+)@#OYk21YD0kouh(y{cE&#`)MW7 zpyk*@&1*3<@VW4uWSNEY`nq1ADV_!i{wf)oM`$4CvHFC|P8zU!75W$8xIfFpyfVUZ z4JaN6G3=zm(S}L77={jiw2nVG{Bn*k50>;@NuKUF`Z+FEd8i(Vyy6RJEq*aL~hG| zDy+YDBz%Pz6&Rr_4)!0WLfbx>kg`+^)hg^WMXn=lXO=x|DA0 zgZInMN&8`A6FNNa+MLwBgbry{$VdXrX?}RM7rqoNMQ&!?SpnQ51%`+@!OtYs9e{y zVZQ>dgL=E=jvOT#j1@`-S6k9x59wjo?LZpXJrDPMO`(BFP4TzNMC|{>ORcb28l=A1 z*2%&B8AXnB@xgV|8-L-C3TPl1a@u1zroknwt?R33;NQJPMLQ4g|KXyyYbx=%P*9ri z3eT5h85{N&;r?}V4ZQZ021cX*IPJyf&qos<*K>Hhk{@(1;xP@joXop&r3If$?iX&d z+Gudr{I$v@Cf*-OLN|(LXkdS>z&1se4o?S$6_l0epmgF^_phr-Q{t?c1h_bkH)&`;u@H*F)y&@Ei?)0-Zyj?Pp=Vff>^;W$r@ zeD1>INAa1w7AhTB}`qbieLm(61aBzXe5UhJFzI=`45S&ZP*?Ql72sU?U zJXP@@f;}z}n0&ObW@6_>0+>Pm;eoALDvvtkI`lp;58Xc~e!Ip~4gM{LKl zI@yFZ1V&FKUwMxX!J(Z|a9(B@3Z7g}mQ@~xmy7Q11N~t*S*?^9VLJ=~x%)JZdJIF3 z&cormM}|RD_2s4$hlU~E;KiUy>M#V}&trW$H4LF~UMtq$9|qa$)t*y#hCxk@9i@$5 zDIoLvk9xZCs|7Jl;ii#ckZ3Ny_fUWV*PGIArYbVv(bYD2vH=4`#ZQQDv}C~Ow>4o~ z$P8#nNSbg=WPsy@>h0Ax86Z@V_vv#Z1KRdWjkr%S;6%3Hi(y43EHhg0h_GjZUha;f zw@yqLcS^il9>WAXxny>~HxtZCy%xQVXM*et?)CI=ChQC{VQ)Fj1n!(3|1)Qp&^{I{ z!_8qrpK@KM>wPBBr)J(Nb}`}c=9;EEb6L>#57^4Xsz+6X*6x>|#;KLYQ` zS{@xMM?m_0(fd8CMj*{+vGzSo`ENTuQL+3bkq;V7?Gb3ZVtDR_`Un_DG|w*58iC?U z@rXUDBk*cf*`%KG2%vM^Vr#h(*wCJ3A&UJ`KR$>`k{N+#mJeNQF#qWs$#oNgBQW~r zWqD>d3nY%7WRsXIxO6FCiRu&!(hfa-UfRckkSc#Wr69n zC&#Z*SYWlLtKpG13p~zSi?2&$!Iv#37w4w1z$) z$H*61Q1W(Bu~rf0^VPWOH&~!o9P=#Y77NNBY@U|Kbv|}FwCYPHh8M6vyx42tHui6m+7>0B z!GcqgrsTe47Q|KT>~1}R$eLUJ#g7GS<7R&1E*8|v5&B>4 zu-v<-!%DkZ@J&wgR}U9cnFN0<}&0{760%j(iHp+jy3VUnwqXP{sR>$S@`^NMb^IeR$8NFedm`en^Hu zCj1`wAj;dt1o2yH4l&kD=<u$44GPRi?UGu^qL755P%C6^s#jhmI=gVbQ z$S`4FeZYYaGYqh2JpHzW!vN`?rf*tw28gFP7IpP8ARv8^wWX5*4r9eN(H|MGB6t4R z?f4wwzOt`aS;v3{&-4dwzhS`Tm~Xq?9^vz7xrSxUV+IhaDs&lw0iRM!4~{;^dTyJd zufJo!)BKcZCLSMhEh|dIb{=PKE|{!kz_xor+cskU;=(ks&-WNmQGcb!@Gb)=C%1pR 
zTg-so?Bf-=*$l8?i(To=#OK&X*|p&@3~-;W)#YH?lDGMHwQ^T-E?j2A1$1w1o7g`xN55sZmXH4CK^7lz~_C%swIm|74R!& znS#Au^BAyeefwp5EFb49z+8asOr*x$m^YsRWr9InF0u@GD>y1zkNwViAiu=_)K=yex zjzgg`SEvd1+wZlaOPh)q(Auy~C+jkPe%d_E&A@r6ZPhJ_KEr^8T$x9eNesBd6f5Cj z`bvqemy7cj7OXU}4Z!iZuYlFSRL*g z@h{cN_;V@EriVoCwc{BK=U}GiP7) zKfQ3o%JHdV@OMykT6i^1r3?SR;LfqI>;sB!Bt_5LMH&MLUQxb*p`Q4PnqQzlML(Xh z`tPkVKfg%Nh`=Lcih(Uj_z%)>I55OBh#cY{89-U{A1@#@>_48NElI#-4mRi)?iEb- zj0gzC7J?%vYwcX-{$K82{eRg;b}llQ9TAD8!n}MvgF}7Ely!e@@`RK9g2+CRl=XkE zNJMxAhXs*6!@VNOlnu6Od;i>N3n%|&Q;bvp2!(}{$-!X(UJ-#2l#Sc5-9Mi{`C)4~ z#Atj8&exk_VlDT_t#7DjL}*lquV+YTcre8@^-qYH85tfJjMK(p`j9D`PX7t#4-e~_ zS)00uV7pjy@X!y_o>Sp47XclcHyzTXiPLD_8W z@PF^)`2`1tP_}G0-|;^a|J~E`3-*ekSZp`1z4vFrMFNwEFtU%QzZX{6YAx`0Ndo;t zyn_D5zU}m%EU~@+m!;*-tN1?PpJGJB1_zUI20nkKx1atSf>+GnvDJ2Se68f)&mUJl zL4jd^Cw83vQ}c*OugC~oA6)aD7`P*m^n=O1fnNIL@KFEIApMYNTxooVB`QYW-#0uk znyl{-9(oYhE<)eQX}3PU`4|{Rj?l;dal*)Cp8)+xFQ3rJ@X(Mox?8te=tuZ?;lloL Tp%)SsORMEM?yqNR7y&vbEl1z zrAUQrm6>5?gu!5DzUQgW^ZmYlfB)u>xnAcw*Ltq&oO|Ydp9^+-Y#m$$NYSLD%YDfa zKHC6ik0*XC;NnkL_~&1`9w-mNPndyLwx_vQUt6?Cnq@sbvmd9>Q8;?RxC&8%cWqNT%$moO|y z7=)6Bml+8Wi2*-eH69frh6P!@4pbpRs(VO9Kwp@6w*010vyU*5K5m@++(DSQy!hFz zI6q;+ShGd5?UNAECz&=&6bKRF??3EzJS;?b`NYK2b_fwNif*hY>x76WpG6{=QbNQL z+m?e9N5}G%zoDf=c0$K4HRgxm59+Zbeazg=O(KMl!npFX zaS_7XX}`v(uPAX}>*bnNYs84FEuH^Rb;JoN>6*A3qvC{!n&`3kcnPBOX>0G7YZ8Qq z$L3+7L`?=_XQqYSKw{EYdIY(xi*Eb@6h0hn?{a& zG8G19+Akicr9#n_M*da?70$HlHeczX!sV{)l%hx~oJxK_#!sO_sfv-#<_;=M-50)A z_<;(P!jmO&m@hkT-nn86Do9tnezPNl3O(o7Ui25F!FE|`%f!1>5Lofu)BZgb6wmc+ z{V+>~eM;5aUeKvv*#1VK=`ML8tD4L3lhp zFRM+S3Mr@EyjjLnC|OhOdDRK~?<ntd*=vI^ zxu=y>RXqrgB2rej@dhEK$a%HFVk#t0TPd&JM}@mCm&7DLQ^DY$qWc@HXmE*oj%6H5 zgW$ODvGjv9*gMmDSUHadQrB(Ex60AsmA_()_f2QuD-Zf#(|_P$%Ai60o?J(GH49h(8;OIqHo z6lVfOIg+B{?IMMnHYhNJ>=pj^cvk(dan=59Ty;l^25wwi0l||L&3`*rd>)B(cs4+7 zGE9Il<7(+%mnRXncPyn>Zl6KvJ5usSou^QSP{h=!eQabE_2Y^wX#}O28mPD#(9nV8 zfRFaN1E_xgdew_YUFew(cSY;v&uHasxBl1jKcnw`+@a@J+L3_Is_!I+&nV{INMlWM z2O5-=^>ppsTT#&puHoD(pAhm&7jW9rj@-YT`5YY9fwRoFj1qUDDS)#3oVro%l{b7LU9Mg z&V{tH5V`Wn8xuYYT@ZYHW!J$G)NowEoZyTg`={HARvr9}-p$c5Eq(MGUGa%}u~~i; z?btLLlRIw|l?TS|drKKbE-kLBzsZgv1;ZOA_npR2nTon{RoNK2{>w|Wk~@aNV#<3= zgvU{E_E_2+y>Vok=seeP-#8L|x!=+wbsXuwqg0v~kE1(Q6V3}V#?f_C*%Zg9an!RQ zfZt^>j3@wZsLpM!VtJ>O(q0QJv0!lz^*0*ak)CPH_>EK@xLP8Ozfqi+zuR2aZxqw`OGS4Mwl`sg z?yMa}y@q--Az`D)wlJO1*))pSa#z<_tr|m3nR!#sc8sBk*_xQ_%rPVmucZwK$B^yC z=ly4O#!Jjy6C4pX`lK7$+Goy2p}lXUbC0r7 zoWiFlp)@=eJ~Y>%ij69IZgk|dvC#o<(Ea|4jiR0g`LF)QMh`^v6FR%th;-RIVeLmY zI#s89prDS8((Qe>_r7AI{J!5O6)M>1xuxLwm5EY2Ao%WTyUo?2g?y5-D4k zK506RHkRB|wVWD5zcWZ$s$FA9-R^K^Z08ti@ywaxw2vX~s}{au(->0ij$5{>e+=md zpq^tLW2h)=NY(et7%J2znf>}bhIBo7Pn2-q)wdj^md+nX(9)~of!AB>m4|(-+c+|) z3f6u{9!I%m*|#@b7)NWHQ$*Dtk0VVJi$YeNT%a@s(^eSM_7_PCUNHnDLS~U?YL~=c_V| z*obm2y7$9cHfsNV`Q_LeHrlCNGmmP*M%QG6=WoUSUx{tFz0eBxpXNoHod+8^<>)V& zCgVPeJJkOG#~c0A%_kXCdH+tGO~p9A9Rd|5_t~gp_ZwzNJsWL)S|2CZ&qe{s-O^Pf zY_vCSH1Vy>1Tyt%DA}Yqfr`q7YH1rM(4Ht?e%oeDr~US>_MAY~=Ui`u`%R#|%(3b3 z2@~k%zL1hHrzX&@BG)3@s}m@!(o!ws#sqR&DDhC_A(qp+`t#1y3H0IpOqWK(1WI~g zWu(_TfjEw<49`&~P^jJs-x%%$dTQ0VXpzJuI(PVWl+3b8v~S0zX7{C&$T?7GZn^0s zvUo0cKH6dui55;RJK!;i99v&?%X&_ty9SbDZAT|jyuE+%jkrlvT6^cc%-Knl>b^6| z@8Tp{ym8sC=8{Pidf{sDy2?qE@Xqhm?R%4`M=SAs@|#K2`K{kXfS5#s%WlSLcTA$L zT_*pOHccW%+%7qXL98bM>s&uiqOqQ(Vh2Vhk?DKI<3A=Qk!$ZzN0RImDmy;Ym#RF4 zcw@3Y{iajs?W=t4lKoR?tS!vY{lpadC2OH}vU&Kvr~aQJPl8wc6;@AW)@|F#rf3TPau;UH&8In{_C9Q0C6WJ%VlX=J{=@t}m$G&0pQ zx=y|}jndaz27GIoMs|l??zhO#pscUQBF8OfP|Jps0!z|nP{#ad_fld8&HD8hT%Dak zgluQGkl8F+n)Hql6F7?^585ms-JV6~Jd10q8)uP{wUL0;94>NP=N44Env1rj-Ex`@ 
z;387U#i^bnT$Fl@?`l!NMHV}go0e2@kx1-WL(vW{dP2`=Upm4?1(VFTb@O;gcV(@# zkTwr(@Ki;k+j;1~JeyqZVIJzpI)6I1kcW1--|FAf#zRk|NJ-N(JXAkD?D$ogkD|Js zcRVrUqX)~T_9b}n(XF~2fiDm7k!!Av&9_)S%9%-`GfMpk9HvYE9_Ai9mp0eR1 zWtCyew_1FZNz!l2GROSFh1OQae8ju`Ooy|Ij|^{`JzIrowj${teMvs5&ddB%F2qN3 zd@eTNfSVA-%6L9a2_2 zR4U`#)T+iq&5`N6VG$l$LyBC=?%^WIgntY|UU1Q{hi2pAVlGO{GJQKi=At`&Hrtb( zxJWcPCiT)vF0$S7X=j!I7cH&*DXR2)77@lXGpVh!=)x|8I*H0zo6PIVU1a*#y@nn5E+S?U(`GwAzAX5-ukGiYP|`@Y1pGpION zv_&{}26d*uxm#{Ch%+;)E4Y3J8D|*2{et^4RL*j>D`^I)7?*~McTb}VhDdko>uHo* z5$?aRbQ)bUc{TXy(ll~fv^3~&{50Bj>8dw3U>bcH&{N&!JdHAZ=3Csdm`0#}%g3(TUsLolOg;k+8uOFM7^2TJ`dpVKt9~ath+(uFY`J&7VvkJ0=Hl-K);% z406yYT9~u0hl4x?51%dSUc} z=R1~DUR0h{i05l9Z|TMiaL`zdamD9v9OO3oV#DRtWEThhqW#D^fcYP8g!^lJ z|LnZVLCp_srYZT@Py5@AE!Q|`EoYr@ zYcU52RK0+?7zSA$aAnV;XB-qiepxI5`@6m2dyO%U>!2LFaT|++WWFreSSvV zpe&w7)A?Ob%~nsNV{)CwH}Lr~H-1-x+o5T62IAK~%bG^~RU`6^mD9+hXw`<=o_(C` z7|lf|;p|9C5*MwteV2ar8isA$=06j9#6{|cbDm8$a1k;7)-8y@mH`Y7A1lZO;s7Qu_(JapZB!>(`%K3WsHy$Si+1;)UOFGU+2fpX=&Lr?rr|eO| z_2;qPUEOx$Qa<9QUNaqfz(*F}em@QWfbAMNm#Vb$k&~jdiV>BMtVVpcOpftU?5=yY z-XbJo`^tDZDQObXt#e~yEB;?sdo*IcrUr>HPf5ww)*%teZNaoUQxb7oJ|5V#B-f8bY!-SoZ_t-Si08n=ZcJtMtX6S7 zNJNQ}$s)D=BqCJ)!Ox{`BqIF!mRU7-5+TE@opWb5iAX3`PrZq0bxnfKVjNeL<(AP{ zOw;RwZJyxxd@q>!j9_ZuxcY_**0bq1o!;$9A_}c;CrkyCh*ZZfo2p|;gjp5m_<;-( zK_l1rMHgUuB6l`Etio~R7-_9Sg6Hq_<|6709p zc0NOb#4zoN{vr&+b*BjI{Coyer66s)pF+gSWe3!^Y!fD`@-OXoD;FlJc@`I%7K;!i zw#W0%WQq_vQ1i}TM3ku5>27P1EK108hV0vf#0bl^hV64w#0a?@wW-&0#RT_P+qaN=M47{{RG? z{{Vz^?OjEsi+{m>ZS4iyYI@;@?C^f7S~m!Mujpzx)(H*KXJ7e>euZ7)5=kp*kYtwK2f>aB##>e$^U>Z+Eh|SO;KEnR@O*$P+TKTGZ{xtZ~uA^s;p{}*) z)tTLfRQR11ws*HF6*i^waxUA@fl9WcnRd`&Nm*WRsmu^e8B5=Dn52W(!^;cn4ToUH zSK(&fE*j*IMbp^}Xi&aeV@XU} z`qjT{+G72U_AeDS(V*7W^V7gW8Wi0P?mX~=3OWZ23@?7B!Mj$e#w*`xa8O{w;yYhy zVC5t8P<$H=Dpa&}7iQAn#81`C4;?h<+R}gXLmdrTz1!JBeKhcN*n4VuB@I5!_8)U9 z$I#W1Bj>XWXmEG(>`2Q}8gPkxXR|Fduvs&F9({|=ce+-p@+i_*AFT;JXY=I((Dr%i*$-QBx>3DMxj*6A&|7-~DE$6VD` zMujw$lg_g_RM^lGo@XycgH@4_LQ2?FSTnWA;P)7Y>SPKYH)6Y5{k7ie<}|o^zM)DT zLuzNvC=~T$KY~7Q#lPctT)N1EXK@_s?LVpR4x+)@#OYk21YD0kouh(y{cE&#`)MW7 zpyk*@&1*3<@VW4uWSNEY`nq1ADV_!i{wf)oM`$4CvHFC|P8zU!75W$8xIfFpyfVUZ z4JaN6G3=zm(S}L77={jiw2nVG{Bn*k50>;@NuKUF`Z+FEd8i(Vyy6RJEq*aL~hG| zDy+YDBz%Pz6&Rr_4)!0WLfbx>kg`+^)hg^WMXn=lXO=x|DA0 zgZInMN&8`A6FNNa+MLwBgbry{$VdXrX?}RM7rqoNMQ&!?SpnQ51%`+@!OtYs9e{y zVZQ>dgL=E=jvOT#j1@`-S6k9x59wjo?LZpXJrDPMO`(BFP4TzNMC|{>ORcb28l=A1 z*2%&B8AXnB@xgV|8-L-C3TPl1a@u1zroknwt?R33;NQJPMLQ4g|KXyyYbx=%P*9ri z3eT5h85{N&;r?}V4ZQZ021cX*IPJyf&qos<*K>Hhk{@(1;xP@joXop&r3If$?iX&d z+Gudr{I$v@Cf*-OLN|(LXkdS>z&1se4o?S$6_l0epmgF^_phr-Q{t?c1h_bkH)&`;u@H*F)y&@Ei?)0-Zyj?Pp=Vff>^;W$r@ zeD1>INAa1w7AhTB}`qbieLm(61aBzXe5UhJFzI=`45S&ZP*?Ql72sU?U zJXP@@f;}z}n0&ObW@6_>0+>Pm;eoALDvvtkI`lp;58Xc~e!Ip~4gM{LKl zI@yFZ1V&FKUwMxX!J(Z|a9(B@3Z7g}mQ@~xmy7Q11N~t*S*?^9VLJ=~x%)JZdJIF3 z&cormM}|RD_2s4$hlU~E;KiUy>M#V}&trW$H4LF~UMtq$9|qa$)t*y#hCxk@9i@$5 zDIoLvk9xZCs|7Jl;ii#ckZ3Ny_fUWV*PGIArYbVv(bYD2vH=4`#ZQQDv}C~Ow>4o~ z$P8#nNSbg=WPsy@>h0Ax86Z@V_vv#Z1KRdWjkr%S;6%3Hi(y43EHhg0h_GjZUha;f zw@yqLcS^il9>WAXxny>~HxtZCy%xQVXM*et?)CI=ChQC{VQ)Fj1n!(3|1)Qp&^{I{ z!_8qrpK@KM>wPBBr)J(Nb}`}c=9;EEb6L>#57^4Xsz+6X*6x>|#;KLYQ` zS{@xMM?m_0(fd8CMj*{+vGzSo`ENTuQL+3bkq;V7?Gb3ZVtDR_`Un_DG|w*58iC?U z@rXUDBk*cf*`%KG2%vM^Vr#h(*wCJ3A&UJ`KR$>`k{N+#mJeNQF#qWs$#oNgBQW~r zWqD>d3nY%7WRsXIxO6FCiRu&!(hfa-UfRckkSc#Wr69n zC&#Z*SYWlLtKpG13p~zSi?2&$!Iv#37w4w1z$) z$H*61Q1W(Bu~rf0^VPWOH&~!o9P=#Y77NNBY@U|Kbv|}FwCYPHh8M6vyx42tHui6m+7>0B z!GcqgrsTe47Q|KT>~1}R$eLUJ#g7GS<7R&1E*8|v5&B>4 zu-v<-!%DkZ@J&wgR}U9cnFN0<}&0{760%j(iHp+jy3VUnwqXP{sR>$S@`^NMb^IeR$8NFedm`en^Hu 
zCj1`wAj;dt1o2yH4l&kD=<u$44GPRi?UGu^qL755P%C6^s#jhmI=gVbQ z$S`4FeZYYaGYqh2JpHzW!vN`?rf*tw28gFP7IpP8ARv8^wWX5*4r9eN(H|MGB6t4R z?f4wwzOt`aS;v3{&-4dwzhS`Tm~Xq?9^vz7xrSxUV+IhaDs&lw0iRM!4~{;^dTyJd zufJo!)BKcZCLSMhEh|dIb{=PKE|{!kz_xor+cskU;=(ks&-WNmQGcb!@Gb)=C%1pR zTg-so?Bf-=*$l8?i(To=#OK&X*|p&@3~-;W)#YH?lDGMHwQ^T-E?j2A1$1w1o7g`xN55sZmXH4CK^7lz~_C%swIm|74R!& znS#Au^BAyeefwp5EFb49z+8asOr*x$m^YsRWr9InF0u@GD>y1zkNwViAiu=_)K=yex zjzgg`SEvd1+wZlaOPh)q(Auy~C+jkPe%d_E&A@r6ZPhJ_KEr^8T$x9eNesBd6f5Cj z`bvqemy7cj7OXU}4Z!iZuYlFSRL*g z@h{cN_;V@EriVoCwc{BK=U}GiP7) zKfQ3o%JHdV@OMykT6i^1r3?SR;LfqI>;sB!Bt_5LMH&MLUQxb*p`Q4PnqQzlML(Xh z`tPkVKfg%Nh`=Lcih(Uj_z%)>I55OBh#cY{89-U{A1@#@>_48NElI#-4mRi)?iEb- zj0gzC7J?%vYwcX-{$K82{eRg;b}llQ9TAD8!n}MvgF}7Ely!e@@`RK9g2+CRl=XkE zNJMxAhXs*6!@VNOlnu6Od;i>N3n%|&Q;bvp2!(}{$-!X(UJ-#2l#Sc5-9Mi{`C)4~ z#Atj8&exk_VlDT_t#7DjL}*lquV+YTcre8@^-qYH85tfJjMK(p`j9D`PX7t#4-e~_ zS)00uV7pjy@X!y_o>Sp47XclcHyzTXiPLD_8W z@PF^)`2`1tP_}G0-|;^a|J~E`3-*ekSZp`1z4vFrMFNwEFtU%QzZX{6YAx`0Ndo;t zyn_D5zU}m%EU~@+m!;*-tN1?PpJGJB1_zUI20nkKx1atSf>+GnvDJ2Se68f)&mUJl zL4jd^Cw83vQ}c*OugC~oA6)aD7`P*m^n=O1fnNIL@KFEIApMYNTxooVB`QYW-#0uk znyl{-9(oYhE<)eQX}3PU`4|{Rj?l;dal*)Cp8)+xFQ3rJ@X(Mox?8te=tuZ?;lloL Tp%)SsOR Date: Mon, 21 Jun 2021 16:49:30 +0200 Subject: [PATCH 103/258] =?UTF-8?q?Fixup=20`trainer.py`=20=F0=9F=9B=A0?= =?UTF-8?q?=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 8b7be3d1..ec6d4417 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -462,12 +462,12 @@ class Trainer: update_lr_scheduler = True if self.use_amp_scaler: if self.use_apex: - with amp.scale_loss(loss_dict["loss"], self.optimizer) as scaled_loss: + with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss: scaled_loss.backward() - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer), - self.config.grad_clip, - ) + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), + grad_clip, + ) else: # model optimizer step in mixed precision mode scaler.scale(loss_dict["loss"]).backward() @@ -739,6 +739,7 @@ class Trainer: self.tb_logger.tb_eval_figures(self.total_steps_done, figures) if audios is not None: self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.tb_logger.tb_eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values) def test_run(self) -> None: """Run test and log the results. Test run must be defined by the model. 
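
With the fix above, the Apex branch scales the loss against the optimizer passed into the optimization step and clips with that step's own `grad_clip`, instead of the trainer-level `self.optimizer` and `self.config.grad_clip`. A minimal sketch of that per-optimizer pattern, assuming Apex is installed and enabled; the helper name and locals are illustrative, not the trainer's actual code:

```python
import torch

try:
    from apex import amp  # Apex is optional and only used when `use_apex` is set
except ImportError:
    amp = None


def apex_optimizer_step(loss, optimizer, grad_clip):
    # Scale the loss for this specific optimizer so that models with several
    # optimizers (e.g. a GAN generator/discriminator pair) each scale and clip
    # their own master parameters.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip)
    optimizer.step()
    return grad_norm
```
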
From ebb91c0fbb1fed99fc56415c45783fa0039f5441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:50:37 +0200 Subject: [PATCH 104/258] Move `TorchSTFT` to `utils.audio` --- TTS/utils/audio.py | 77 ++++++++++++++++++++++++++++++++++ TTS/vocoder/layers/losses.py | 80 +----------------------------------- 2 files changed, 79 insertions(+), 78 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 222b4c74..e1913e98 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -3,12 +3,89 @@ import numpy as np import scipy.io.wavfile import scipy.signal import soundfile as sf +import torch +from torch import nn from TTS.tts.utils.data import StandardScaler # import pyworld as pw +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """TODO: Merge this with audio.py""" + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [B x 1 x T] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + # pylint: disable=too-many-public-methods class AudioProcessor(object): """Audio Processor for TTS used by all the data pipelines. 
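
With `TorchSTFT` now exposed from `TTS.utils.audio`, it can be used on its own to turn a batch of waveforms into (mel-)spectrogram frames. A minimal sketch, with illustrative parameter values rather than ones taken from any particular audio config:

```python
import torch

from TTS.utils.audio import TorchSTFT  # new import path after this move

# Illustrative settings; real runs read these from the model's audio config.
torch_stft = TorchSTFT(
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sample_rate=22050,
    n_mels=80,
    use_mel=True,  # apply the mel filterbank on top of the magnitude spectrogram
)

wav = torch.randn(4, 22050)  # [B, T]: four 1-second dummy waveforms
mel = torch_stft(wav)        # [B, n_mels, T // hop_length + 1] mel frames
```
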
diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 9acdeea1..848e292b 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,88 +1,12 @@ from typing import Dict, Union -import librosa import torch from torch import nn from torch.nn import functional as F +from TTS.utils.audio import TorchSTFT from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss - -class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """TODO: Merge this with audio.py""" - - def __init__( - self, - n_fft, - hop_length, - win_length, - pad_wav=False, - window="hann_window", - sample_rate=None, - mel_fmin=0, - mel_fmax=None, - n_mels=80, - use_mel=False, - ): - super().__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.pad_wav = pad_wav - self.sample_rate = sample_rate - self.mel_fmin = mel_fmin - self.mel_fmax = mel_fmax - self.n_mels = n_mels - self.use_mel = use_mel - self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) - self.mel_basis = None - if use_mel: - self._build_mel_basis() - - def __call__(self, x): - """Compute spectrogram frames by torch based stft. - - Args: - x (Tensor): input waveform - - Returns: - Tensor: spectrogram frames. - - Shapes: - x: [B x T] or [B x 1 x T] - """ - if x.ndim == 2: - x = x.unsqueeze(1) - if self.pad_wav: - padding = int((self.n_fft - self.hop_length) / 2) - x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") - # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True, - return_complex=False, - ) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) - if self.use_mel: - S = torch.matmul(self.mel_basis.to(x), S) - return S - - def _build_mel_basis(self): - mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax - ) - self.mel_basis = torch.from_numpy(mel_basis).float() - - ################################# # GENERATOR LOSSES ################################# @@ -275,7 +199,7 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): loss += total_loss real_loss += real_loss fake_loss += fake_loss - # normalize loss values with number of scales + # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) fake_loss /= len(scores_fake) From 6f739ea07ae543f5d80f4a145b6126f402b6c7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:51:28 +0200 Subject: [PATCH 105/258] =?UTF-8?q?Fix=20`eval=5Flog`=20for=20`gan.py`=20?= =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 2 +- TTS/vocoder/models/gan.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index ec6d4417..f628d9a4 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -36,7 +36,7 @@ from TTS.utils.generic_utils import ( ) from TTS.utils.io import copy_model_files, save_best_model, save_checkpoint from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.trainer_utils import * +from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env from 
TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.models import setup_model as setup_vocoder_model diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 58d6532e..94583147 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -144,20 +144,24 @@ class GAN(BaseVocoder): return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + @staticmethod + def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: y_hat = outputs[0]["model_outputs"] y = batch["waveform"] - figures = plot_results(y_hat, y, ap, "train") + figures = plot_results(y_hat, y, ap, name) sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - audios = {"train/audio": sample_voice} + audios = {f"{name}/audio": sample_voice} return figures, audios + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return self._log("train", ap, batch, outputs) + @torch.no_grad() def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: - return self.train_log(ap, batch, outputs) + return self._log("eval", ap, batch, outputs) def load_checkpoint( self, From 61c3cb871f535c96d66b17085cfa6f7d1483ccb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:53:19 +0200 Subject: [PATCH 106/258] =?UTF-8?q?Docstring=20edit=20in=20`TTSDataset.py`?= =?UTF-8?q?=20=E2=9C=8D=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/tts/datasets/TTSDataset.py | 131 +++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index d0fbb553..0fc23231 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -2,6 +2,7 @@ import collections import os import random from multiprocessing import Pool +from typing import Dict, List import numpy as np import torch @@ -10,52 +11,82 @@ from torch.utils.data import Dataset from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.text import pad_with_eos_bos, phoneme_to_sequence, text_to_sequence +from TTS.utils.audio import AudioProcessor class TTSDataset(Dataset): def __init__( self, - outputs_per_step, - text_cleaner, - compute_linear_spec, - ap, - meta_data, - tp=None, - add_blank=False, - batch_group_size=0, - min_seq_len=0, - max_seq_len=float("inf"), - use_phonemes=False, - phoneme_cache_path=None, - phoneme_language="en-us", - enable_eos_bos=False, - speaker_id_mapping=None, - d_vector_mapping=None, - use_noise_augment=False, - verbose=False, + outputs_per_step: int, + text_cleaner: list, + compute_linear_spec: bool, + ap: AudioProcessor, + meta_data: List[List], + characters: Dict = None, + add_blank: bool = False, + batch_group_size: int = 0, + min_seq_len: int = 0, + max_seq_len: int = float("inf"), + use_phonemes: bool = False, + phoneme_cache_path: str = None, + phoneme_language: str = "en-us", + enable_eos_bos: bool = False, + speaker_id_mapping: Dict = None, + d_vector_mapping: Dict = None, + use_noise_augment: bool = False, + verbose: bool = False, ): - """ + """Generic 📂 data loader for `tts` models. 
It is configurable for different outputs and needs. + + If you need something different, you can either override or create a new class as the dataset is + initialized by the model. + Args: - outputs_per_step (int): number of time frames predicted per step. - text_cleaner (str): text cleaner used for the dataset. + outputs_per_step (int): Number of time frames predicted per step. + + text_cleaner (list): List of text cleaners to clean the input text before converting to sequence IDs. + compute_linear_spec (bool): compute linear spectrogram if True. - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - tp (dict): dict of custom text characters used for converting texts to sequences. - batch_group_size (int): (0) range of batch randomization after sorting - sequences by length. - min_seq_len (int): (0) minimum sequence length to be processed - by the loader. - max_seq_len (int): (float("inf")) maximum sequence length. - use_phonemes (bool): (true) if true, text converted to phonemes. - phoneme_cache_path (str): path to cache phoneme features. - phoneme_language (str): one the languages from - https://github.com/bootphon/phonemizer#languages - enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. - speaker_id_mapping (dict): list of speaker ids to map speaker names to numerical ids. - d_vector_mapping (dict): dictionary of d-vectors that maps each audio file to a pre-computed d-vector. - use_noise_augment (bool): enable adding random noise to wav for augmentation. - verbose (bool): print diagnostic information. + + ap (TTS.tts.utils.AudioProcessor): Audio processor object. + + meta_data (list): List of dataset instances. + + characters (dict): `dict` of custom text characters used for converting texts to sequences. + + add_blank (bool): Add a special `blank` character after every other character. It helps some + models achieve better results. Defaults to false. + + batch_group_size (int): Range of batch randomization after sorting + sequences by length. It shuffles each batch with bucketing to gather similar lenght sequences in a + batch. Set 0 to disable. Defaults to 0. + + min_seq_len (int): Minimum input sequence length to be processed + by the loader. Filter out input sequences that are shorter than this. Some models have a + minimum input length due to its architecture. Defaults to 0. + + max_seq_len (int): Maximum input sequence length. Filter out input sequences that are longer than this. + It helps for controlling the VRAM usage against long input sequences. Especially models with + RNN layers are sensitive to input length. Defaults to `Inf`. + + use_phonemes (bool): If true, input text converted to phonemes. Defaults to false. + + phoneme_cache_path (str): Path to cache phoneme features. It writes computed phonemes to files to use in + the coming iterations. Defaults to None. + + phoneme_language (str): One the languages from supported by the phonemizer interface. Defaults to `en-us`. + + enable_eos_bos (bool): Enable the `end of sentence` and the `beginning of sentences characters`. Defaults + to False. + + speaker_id_mapping (dict): Mapping of speaker names to IDs used to compute embedding vectors by the + embedding layer. Defaults to None. + + d_vector_mapping (dict): Mapping of wav files to computed d-vectors. Defaults to None. + + use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False. + + verbose (bool): Print diagnostic information. Defaults to false. 
""" super().__init__() self.batch_group_size = batch_group_size @@ -67,7 +98,7 @@ class TTSDataset(Dataset): self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap - self.tp = tp + self.characters = characters self.add_blank = add_blank self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path @@ -97,13 +128,13 @@ class TTSDataset(Dataset): return data @staticmethod - def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank): + def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, characters, add_blank): """generate a phoneme sequence from text. since the usage is for subsequent caching, we never add bos and eos chars here. Instead we add those dynamically later; based on the config option.""" phonemes = phoneme_to_sequence( - text, [cleaners], language=language, enable_eos_bos=False, tp=tp, add_blank=add_blank + text, [cleaners], language=language, enable_eos_bos=False, tp=characters, add_blank=add_blank ) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) @@ -111,7 +142,7 @@ class TTSDataset(Dataset): @staticmethod def _load_or_generate_phoneme_sequence( - wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank + wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, characters, add_blank ): file_name = os.path.splitext(os.path.basename(wav_file))[0] @@ -122,15 +153,15 @@ class TTSDataset(Dataset): phonemes = np.load(cache_path) except FileNotFoundError: phonemes = TTSDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + text, cache_path, cleaners, language, characters, add_blank ) except (ValueError, IOError): print(" [!] failed loading phonemes for {}. 
" "Recomputing.".format(wav_file)) phonemes = TTSDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + text, cache_path, cleaners, language, characters, add_blank ) if enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes, tp=tp) + phonemes = pad_with_eos_bos(phonemes, tp=characters) phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes @@ -158,13 +189,14 @@ class TTSDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ) else: text = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) assert text.size > 0, self.items[idx][1] @@ -206,7 +238,8 @@ class TTSDataset(Dataset): for idx, item in enumerate(tqdm.tqdm(self.items)): text, *_ = item sequence = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) self.items[idx][0] = sequence @@ -216,7 +249,7 @@ class TTSDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ] if self.verbose: From af12925142cd506c25c22b9cfb10be9726f2a33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 17:23:24 +0200 Subject: [PATCH 107/258] =?UTF-8?q?Add=20AlignTTS=20recipe=20=F0=9F=91=A9?= =?UTF-8?q?=E2=80=8D=F0=9F=8D=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recipes/ljspeech/align_tts/train_aligntts.py | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 recipes/ljspeech/align_tts/train_aligntts.py diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py new file mode 100644 index 00000000..4a4f86c4 --- /dev/null +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import AlignTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = AlignTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() From 0ff2d2336aebcb7df300f3225d570e137c6d37d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Jun 2021 03:03:30 +0200 Subject: [PATCH 108/258] =?UTF-8?q?Fix=20wrong=20argument=20name=20?= =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/tts/models/base_tts.py 
| 2 +- notebooks/ExtractTTSpectrogram.ipynb | 2 +- tests/data_tests/test_loader.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 35721f59..cbb441fe 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -89,7 +89,7 @@ Example run: compute_linear_spec=False, ap=ap, meta_data=meta_data, - tp=C.characters if "characters" in C.keys() else None, + characters=c.characters if "characters" in C.keys() else None, add_blank=C["add_blank"] if "add_blank" in C.keys() else False, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 11cdfe31..b0159b86 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -27,7 +27,7 @@ def setup_loader(ap, r, verbose=False): compute_linear_spec=False, meta_data=meta_data, ap=ap, - tp=c.characters if "characters" in c.keys() else None, + characters=c.characters if "characters" in c.keys() else None, add_blank=c["add_blank"] if "add_blank" in c.keys() else False, batch_group_size=0, min_seq_len=c.min_seq_len, diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 1de7ba92..015d0200 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -164,7 +164,7 @@ class BaseTTS(BaseModel): compute_linear_spec=config.model.lower() == "tacotron", meta_data=data_items, ap=ap, - tp=config.characters, + characters=config.characters, add_blank=config["add_blank"], batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, min_seq_len=config.min_seq_len, diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index bdc7c955..4e42a3bb 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -112,7 +112,7 @@ "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,characters=c.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index cad89d09..9bc70ddd 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -44,7 +44,7 @@ class TestTTSDataset(unittest.TestCase): compute_linear_spec=True, ap=self.ap, meta_data=items, - tp=c.characters, + characters=c.characters, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), From 18e5393f1637d2052ce94b6b88cdc866d81bc86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Jun 2021 21:39:17 +0200 Subject: [PATCH 109/258] =?UTF-8?q?Add=20=F0=9F=90=8D=20python=203.9=20to?= =?UTF-8?q?=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.github/workflows/main.yml | 4 ++-- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/distribute.py | 1 - setup.py | 3 ++- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 74d5e85b..68be9274 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,8 +18,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8] - + python-version: [3.6, 3.7, 3.8, 3.9] + experimental: [false] steps: - uses: actions/checkout@v2 - uses: actions/cache@v1 diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index cbb441fe..88d60d7d 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -89,7 +89,7 @@ Example run: compute_linear_spec=False, ap=ap, meta_data=meta_data, - characters=c.characters if "characters" in C.keys() else None, + characters=C.characters if "characters" in C.keys() else None, add_blank=C["add_blank"] if "add_blank" in C.keys() else False, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 873ddb1f..742c0197 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -18,7 +18,6 @@ def main(): parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target training script to distibute.") args, unargs = parser.parse_known_args() - breakpoint() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") diff --git a/setup.py b/setup.py index b4015455..bd6a6aae 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup -if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): +if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"): raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) @@ -106,6 +106,7 @@ setup( "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", From aba840b4e676c9d01202d2ba72505e62fff0560b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 11:08:34 +0200 Subject: [PATCH 110/258] =?UTF-8?q?Fix=20loading=20the=20`amp`=20scaler=20?= =?UTF-8?q?from=20a=20checkpoint=20=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index f628d9a4..d5aec1c9 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -306,7 +306,7 @@ class Trainer: model.load_state_dict(checkpoint["model"]) print(" > Restoring Optimizer...") optimizer = _restore_list_objs(checkpoint["optimizer"], optimizer) - if "scaler" in checkpoint and self.use_amp_scaler: + if "scaler" in checkpoint and self.use_amp_scaler and checkpoint["scaler"]: print(" > Restoring AMP Scaler...") scaler = _restore_list_objs(checkpoint["scaler"], scaler) except (KeyError, RuntimeError): From 19a2cdc1e5da5761054c2c7f7ee1983e4730f06e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 
11:42:11 +0200 Subject: [PATCH 111/258] Update `umap` and `numba` vers. to 0.5.1 and 0.53 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index bc69481a..8e32a373 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,8 +15,8 @@ soundfile tensorboardX torch>=1.7 tqdm -numba==0.52 -umap-learn==0.4.6 +numba==0.53 +umap-learn==0.5.1 anyascii coqpit # japanese g2p deps From 64fd59204c18c8f935831e39e213c6cbe7ef5576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 13:45:59 +0200 Subject: [PATCH 112/258] Use `torch.linalg.qr` for pytorch > `v1.9.0` --- TTS/tts/layers/glow_tts/glow.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 18c491e3..7620ef88 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import torch from torch import nn from torch.nn import functional as F @@ -81,7 +83,11 @@ class InvConvNear(nn.Module): self.no_jacobian = no_jacobian self.weight_inv = None - w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + if LooseVersion(torch.__version__) < LooseVersion("1.9"): + w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + else: + w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] + if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) From 788992093d7451dceb8e2acb84c00515cbf94b4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 17:49:08 +0200 Subject: [PATCH 113/258] =?UTF-8?q?Add=20UnivNet=20vocoder=20=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 1 + TTS/vocoder/configs/univnet_config.py | 160 ++++++++++++++++ TTS/vocoder/layers/lvc_block.py | 198 ++++++++++++++++++++ TTS/vocoder/models/__init__.py | 6 + TTS/vocoder/models/univnet_discriminator.py | 96 ++++++++++ TTS/vocoder/models/univnet_generator.py | 145 ++++++++++++++ recipes/ljspeech/univnet/train.py | 29 +++ 7 files changed, 635 insertions(+) create mode 100644 TTS/vocoder/configs/univnet_config.py create mode 100644 TTS/vocoder/layers/lvc_block.py create mode 100644 TTS/vocoder/models/univnet_discriminator.py create mode 100644 TTS/vocoder/models/univnet_generator.py create mode 100644 recipes/ljspeech/univnet/train.py diff --git a/README.md b/README.md index 92c2ee52..e2a33e15 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) - WaveGrad: [paper](https://arxiv.org/abs/2009.00713) - HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) +- UnivNet: [paper](https://arxiv.org/pdf/2106.07889.pdf) You can also help us implement more models. Some 🐸TTS related work can be found [here](https://github.com/erogol/TTS-papers). 
diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py new file mode 100644 index 00000000..85662831 --- /dev/null +++ b/TTS/vocoder/configs/univnet_config.py @@ -0,0 +1,160 @@ +from dataclasses import dataclass, field + +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig + + +@dataclass +class UnivnetConfig(BaseGANVocoderConfig): + """Defines parameters for UnivNet vocoder. + + Example: + + >>> from TTS.vocoder.configs import UnivNetConfig + >>> config = UnivNetConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `UnivNet`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'UnivNet_discriminator`. + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `UnivNet_generator`. + generator_model_params (dict): Parameters of the generator model. Defaults to + ` + { + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + } + ` + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 32. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + use_stft_loss (bool): + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by univnet model. Defaults to False. + stft_loss_params (dict): + STFT loss parameters. Default to + `{ + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }` + l1_spec_loss_params (dict): + L1 spectrogram loss parameters. Default to + `{ + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + }` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. 
+ hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ + + model: str = "univnet" + batch_size: int = 32 + # model specific params + discriminator_model: str = "univnet_discriminator" + generator_model: str = "univnet_generator" + generator_model_params: dict = field( + default_factory=lambda: { + "in_channels": 64, + "out_channels": 1, + "hidden_channels": 32, + "cond_channels": 80, + "upsample_factors": [8, 8, 4], + "lvc_layers_each_block": 4, + "lvc_kernel_size": 3, + "kpnet_hidden_channels": 64, + "kpnet_conv_size": 3, + "dropout": 0.0, + } + ) + + # LOSS PARAMETERS - overrides + use_stft_loss: bool = True + use_subband_stft_loss: bool = False + use_mse_gan_loss: bool = True + use_hinge_gan_loss: bool = False + use_feat_match_loss: bool = False # requires MelGAN Discriminators (MelGAN and univnet) + use_l1_spec_loss: bool = False + + # loss weights - overrides + stft_loss_weight: float = 2.5 + stft_loss_params: dict = field( + default_factory=lambda: { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240], + } + ) + subband_stft_loss_weight: float = 0 + mse_G_loss_weight: float = 1 + hinge_G_loss_weight: float = 0 + feat_match_loss_weight: float = 0 + l1_spec_loss_weight: float = 0 + l1_spec_loss_params: dict = field( + default_factory=lambda: { + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + } + ) + + # optimizer parameters + lr_gen: float = 1e-4 # Initial learning rate. + lr_disc: float = 1e-4 # Initial learning rate. 
+ lr_scheduler_gen: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) + lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + steps_to_start_discriminator: int = 200000 + + def __post_init__(self): + super().__post_init__() + self.generator_model_params["cond_channels"] = self.audio.num_mels diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py new file mode 100644 index 00000000..0e29ee3c --- /dev/null +++ b/TTS/vocoder/layers/lvc_block.py @@ -0,0 +1,198 @@ +import torch +import torch.nn.functional as F + + +class KernelPredictor(torch.nn.Module): + """Kernel predictor for the location-variable convolutions""" + + def __init__( # pylint: disable=dangerous-default-value + self, + cond_channels, + conv_in_channels, + conv_out_channels, + conv_layers, + conv_kernel_size=3, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + kpnet_nonlinear_activation="LeakyReLU", + kpnet_nonlinear_activation_params={"negative_slope": 0.1}, + ): + """ + Args: + cond_channels (int): number of channel for the conditioning sequence, + conv_in_channels (int): number of channel for the input sequence, + conv_out_channels (int): number of channel for the output sequence, + conv_layers (int): + kpnet_ + """ + super().__init__() + + self.conv_in_channels = conv_in_channels + self.conv_out_channels = conv_out_channels + self.conv_kernel_size = conv_kernel_size + self.conv_layers = conv_layers + + l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers + l_b = conv_out_channels * conv_layers + + padding = (kpnet_conv_size - 1) // 2 + self.input_conv = torch.nn.Sequential( + torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + self.residual_conv = torch.nn.Sequential( + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + 
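+        # The two convolutions below emit the location-variable filter parameters for every
+        # conditioning frame: kernel_conv outputs l_w values per frame, reshaped in forward() to
+        # (batch, conv_layers, conv_in_channels, conv_out_channels, conv_kernel_size, cond_length),
+        # and bias_conv outputs l_b values per frame, reshaped to (batch, conv_layers, conv_out_channels, cond_length).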
self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size, padding=padding, bias=True) + self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size, padding=padding, bias=True) + + def forward(self, c): + """ + Args: + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + Returns: + """ + batch, _, cond_length = c.shape + + c = self.input_conv(c) + c = c + self.residual_conv(c) + k = self.kernel_conv(c) + b = self.bias_conv(c) + + kernels = k.contiguous().view( + batch, self.conv_layers, self.conv_in_channels, self.conv_out_channels, self.conv_kernel_size, cond_length + ) + bias = b.contiguous().view(batch, self.conv_layers, self.conv_out_channels, cond_length) + return kernels, bias + + +class LVCBlock(torch.nn.Module): + """the location-variable convolutions""" + + def __init__( + self, + in_channels, + cond_channels, + upsample_ratio, + conv_layers=4, + conv_kernel_size=3, + cond_hop_length=256, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + ): + super().__init__() + + self.cond_hop_length = cond_hop_length + self.conv_layers = conv_layers + self.conv_kernel_size = conv_kernel_size + self.convs = torch.nn.ModuleList() + + self.upsample = torch.nn.ConvTranspose1d( + in_channels, + in_channels, + kernel_size=upsample_ratio * 2, + stride=upsample_ratio, + padding=upsample_ratio // 2 + upsample_ratio % 2, + output_padding=upsample_ratio % 2, + ) + + self.kernel_predictor = KernelPredictor( + cond_channels=cond_channels, + conv_in_channels=in_channels, + conv_out_channels=2 * in_channels, + conv_layers=conv_layers, + conv_kernel_size=conv_kernel_size, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=kpnet_dropout, + ) + + for i in range(conv_layers): + padding = (3 ** i) * int((conv_kernel_size - 1) / 2) + conv = torch.nn.Conv1d( + in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i + ) + + self.convs.append(conv) + + def forward(self, x, c): + """forward propagation of the location-variable convolutions. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length) + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + + Returns: + Tensor: the output sequence (batch, in_channels, in_length) + """ + in_channels = x.shape[1] + kernels, bias = self.kernel_predictor(c) + + x = F.leaky_relu(x, 0.2) + x = self.upsample(x) + + for i in range(self.conv_layers): + y = F.leaky_relu(x, 0.2) + y = self.convs[i](y) + y = F.leaky_relu(y, 0.2) + + k = kernels[:, i, :, :, :, :] + b = bias[:, i, :, :] + y = self.location_variable_convolution(y, k, b, 1, self.cond_hop_length) + x = x + torch.sigmoid(y[:, :in_channels, :]) * torch.tanh(y[:, in_channels:, :]) + return x + + @staticmethod + def location_variable_convolution(x, kernel, bias, dilation, hop_size): + """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. + Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length). + kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) + bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) + dilation (int): the dilation of convolution. + hop_size (int): the hop_size of the conditioning sequence. 
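+                (x and kernel must satisfy in_length == kernel_length * hop_size; this is asserted below before the unfolding steps.)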
+ Returns: + (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). + """ + batch, _, in_length = x.shape + batch, _, out_channels, kernel_size, kernel_length = kernel.shape + + assert in_length == ( + kernel_length * hop_size + ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + + padding = dilation * int((kernel_size - 1) / 2) + x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) + x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) + + if hop_size < dilation: + x = F.pad(x, (0, dilation), "constant", 0) + x = x.unfold( + 3, dilation, dilation + ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) + x = x[:, :, :, :, :hop_size] + x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) + x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) + + o = torch.einsum("bildsk,biokl->bolsd", x, kernel) + o = o + bias.unsqueeze(-1).unsqueeze(-1) + o = o.contiguous().view(batch, out_channels, -1) + return o diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index cbd3950b..9479095e 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -31,6 +31,7 @@ def setup_model(config: Coqpit): def setup_generator(c): + """ TODO: use config object as arguments""" print(" > Generator Model: {}".format(c.generator_model)) MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) @@ -85,12 +86,15 @@ def setup_generator(c): use_weight_norm=True, upsample_factors=c.generator_model_params["upsample_factors"], ) + elif c.generator_model.lower() in "univnet_generator": + model = MyModel(**c.generator_model_params) else: raise NotImplementedError(f"Model {c.generator_model} not implemented!") return model def setup_discriminator(c): + """ TODO: use config objekt as arguments""" print(" > Discriminator Model: {}".format(c.discriminator_model)) if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") @@ -144,4 +148,6 @@ def setup_discriminator(c): nonlinear_activation_params={"negative_slope": 0.2}, bias=True, ) + if c.discriminator_model == "univnet_discriminator": + model = MyModel() return model diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py new file mode 100644 index 00000000..d99b2760 --- /dev/null +++ b/TTS/vocoder/models/univnet_discriminator.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import spectral_norm, weight_norm + +from TTS.utils.audio import TorchSTFT +from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator + +LRELU_SLOPE = 0.1 + + +class SpecDiscriminator(nn.Module): + """docstring for Discriminator.""" + + def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False): + super().__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.fft_size = fft_size + self.hop_length = hop_length + self.win_length = win_length + self.stft = TorchSTFT(fft_size, hop_length, win_length) + self.discriminators = nn.ModuleList( + [ + norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))), + 
norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))), + ] + ) + + self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1)) + + def forward(self, y): + + fmap = [] + with torch.no_grad(): + y = y.squeeze(1) + y = self.stft(y) + y = y.unsqueeze(1) + for _, d in enumerate(self.discriminators): + y = d(y) + y = F.leaky_relu(y, LRELU_SLOPE) + fmap.append(y) + + y = self.out(y) + fmap.append(y) + + return torch.flatten(y, 1, -1), fmap + + +class MultiResSpecDiscriminator(torch.nn.Module): + def __init__( # pylint: disable=dangerous-default-value + self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window" + ): + + super().__init__() + self.discriminators = nn.ModuleList( + [ + SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window), + SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window), + SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window), + ] + ) + + def forward(self, x): + scores = [] + feats = [] + for d in self.discriminators: + score, feat = d(x) + scores.append(score) + feats.append(feat) + + return scores, feats + + +class UnivnetDiscriminator(nn.Module): + """Univnet discriminator wrapping MPD and MSD.""" + + def __init__(self): + super().__init__() + self.mpd = MultiPeriodDiscriminator() + self.msd = MultiResSpecDiscriminator() + + def forward(self, x): + """ + Args: + x (Tensor): input waveform. + + Returns: + List[Tensor]: discriminator scores. + List[List[Tensor]]: list of list of features from each layers of each discriminator. 
+ """ + scores, feats = self.mpd(x) + scores_, feats_ = self.msd(x) + return scores + scores_, feats + feats_ diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py new file mode 100644 index 00000000..4604abb2 --- /dev/null +++ b/TTS/vocoder/models/univnet_generator.py @@ -0,0 +1,145 @@ +import numpy as np +import torch +import torch.nn.functional as F + +from TTS.vocoder.layers.lvc_block import LVCBlock + +LRELU_SLOPE = 0.1 + + +class UnivnetGenerator(torch.nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + cond_channels, + upsample_factors, + lvc_layers_each_block, + lvc_kernel_size, + kpnet_hidden_channels, + kpnet_conv_size, + dropout, + use_weight_norm=True, + ): + + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.cond_channels = cond_channels + self.upsample_scale = np.prod(upsample_factors) + self.lvc_block_nums = len(upsample_factors) + + # define first convolution + self.first_conv = torch.nn.Conv1d( + in_channels, hidden_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True + ) + + # define residual blocks + self.lvc_blocks = torch.nn.ModuleList() + cond_hop_length = 1 + for n in range(self.lvc_block_nums): + cond_hop_length = cond_hop_length * upsample_factors[n] + lvcb = LVCBlock( + in_channels=hidden_channels, + cond_channels=cond_channels, + upsample_ratio=upsample_factors[n], + conv_layers=lvc_layers_each_block, + conv_kernel_size=lvc_kernel_size, + cond_hop_length=cond_hop_length, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=dropout, + ) + self.lvc_blocks += [lvcb] + + # define output layers + self.last_conv_layers = torch.nn.ModuleList( + [ + torch.nn.Conv1d( + hidden_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True + ), + ] + ) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, c): + """Calculate forward propagation. + Args: + c (Tensor): Local conditioning auxiliary features (B, C ,T'). 
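+                (T' is the number of conditioning frames; the output length T is T' times the product of upsample_factors, because the noise input is drawn at frame rate and upsampled by the LVC blocks.)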
+ Returns: + Tensor: Output tensor (B, out_channels, T) + """ + # random noise + x = torch.randn([c.shape[0], self.in_channels, c.shape[2]]) + x = x.to(self.first_conv.bias.device) + x = self.first_conv(x) + + for n in range(self.lvc_block_nums): + x = self.lvc_blocks[n](x, c) + + # apply final layers + for f in self.last_conv_layers: + x = F.leaky_relu(x, LRELU_SLOPE) + x = f(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + + def _remove_weight_norm(m): + try: + # print(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + + def _apply_weight_norm(m): + if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + torch.nn.utils.weight_norm(m) + # print(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + @staticmethod + def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2 ** x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + @property + def receptive_field_size(self): + """Return receptive field size.""" + return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + + def inference(self, c=None, x=None): + """Perform inference. + Args: + c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). + x (Union[Tensor, ndarray]): Input noise signal (T, 1). + Returns: + Tensor: Output tensor (T, out_channels) + """ + if x is not None: + if not isinstance(x, torch.Tensor): + x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) + x = x.transpose(1, 0).unsqueeze(0) + else: + assert c is not None + x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) + if c is not None: + if not isinstance(c, torch.Tensor): + c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) + c = c.transpose(1, 0).unsqueeze(0) + c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) + return self.forward(c).squeeze(0).transpose(1, 0) diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py new file mode 100644 index 00000000..d8f33ae3 --- /dev/null +++ b/recipes/ljspeech/univnet/train.py @@ -0,0 +1,29 @@ +import os + +from TTS.trainer import Trainer, TrainingArgs, init_training +from TTS.vocoder.configs import UnivnetConfig + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = UnivnetConfig( + batch_size=64, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=False, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() From 42554cc711b4a471fd35724a31daf85da0e843d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 20 May 2021 18:22:52 +0200 Subject: [PATCH 114/258] rename MyDataset -> TTSDataset --- 
TTS/bin/compute_attention_masks.py | 4 ++-- TTS/bin/extract_tts_spectrograms.py | 4 ++-- TTS/bin/train_align_tts.py | 4 ++-- TTS/bin/train_glow_tts.py | 4 ++-- TTS/bin/train_speedy_speech.py | 4 ++-- TTS/bin/train_tacotron.py | 4 ++-- TTS/tts/datasets/TTSDataset.py | 10 +++++----- notebooks/ExtractTTSpectrogram.ipynb | 4 ++-- tests/data_tests/test_loader.py | 2 +- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 0a4337da..e14ff433 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -8,7 +8,7 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import load_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols @@ -83,7 +83,7 @@ Example run: preprocessor = importlib.import_module("TTS.tts.datasets.preprocess") preprocessor = getattr(preprocessor, args.dataset) meta_data = preprocessor(args.data_path, args.dataset_metafile) - dataset = MyDataset( + dataset = TTSDataset( model.decoder.r, C.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index fb3a8321..da6de9c0 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -11,7 +11,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.speakers import parse_speakers from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols @@ -22,7 +22,7 @@ use_cuda = torch.cuda.is_available() def setup_loader(ap, r, verbose=False): - dataset = MyDataset( + dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index 7e3921b0..f5658dd2 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -14,7 +14,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -38,7 +38,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index e93a4e8a..50e95a2b 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -15,7 +15,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -38,7 +38,7 @@ def 
setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 2fba3df1..4ab0c899 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -16,7 +16,7 @@ from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -39,7 +39,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): if is_val and not config.run_eval: loader = None else: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=False, diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 9685d0d7..098a8d3f 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -12,7 +12,7 @@ import torch from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset +from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint @@ -43,7 +43,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): loader = None else: if dataset is None: - dataset = MyDataset( + dataset = TTSDataset( r, config.text_cleaner, compute_linear_spec=config.model.lower() == "tacotron", diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 4ca93232..cbb0a593 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -12,7 +12,7 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.text import pad_with_eos_bos, phoneme_to_sequence, text_to_sequence -class MyDataset(Dataset): +class TTSDataset(Dataset): def __init__( self, outputs_per_step, @@ -117,12 +117,12 @@ class MyDataset(Dataset): try: phonemes = np.load(cache_path) except FileNotFoundError: - phonemes = MyDataset._generate_and_cache_phoneme_sequence( + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( text, cache_path, cleaners, language, tp, add_blank ) except (ValueError, IOError): print(" [!] failed loading phonemes for {}. 
" "Recomputing.".format(wav_file)) - phonemes = MyDataset._generate_and_cache_phoneme_sequence( + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( text, cache_path, cleaners, language, tp, add_blank ) if enable_eos_bos: @@ -190,7 +190,7 @@ class MyDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item - phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) + phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes def compute_input_seq(self, num_workers=0): @@ -225,7 +225,7 @@ class MyDataset(Dataset): with Pool(num_workers) as p: phonemes = list( tqdm.tqdm( - p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), + p.imap(TTSDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items), ) ) diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index dc35e86f..bdc7c955 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -22,7 +22,7 @@ "import numpy as np\n", "from tqdm import tqdm as tqdm\n", "from torch.utils.data import DataLoader\n", - "from TTS.tts.datasets.TTSDataset import MyDataset\n", + "from TTS.tts.datasets.TTSDataset import TTSDataset\n", "from TTS.tts.layers.losses import L1LossMasked\n", "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", @@ -112,7 +112,7 @@ "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index e2dba37a..053da516 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - dataset = TTSDataset.MyDataset( + dataset = TTSDataset.TTSDataset( r, c.text_cleaner, compute_linear_spec=True, From 8def3c87aff69d350b1c59eaf83cc0ebf1925ef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 20 May 2021 18:23:53 +0200 Subject: [PATCH 115/258] trainer-API updates --- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/bin/train_vocoder_gan.py | 2 +- TTS/bin/train_vocoder_wavegrad.py | 2 +- TTS/bin/train_vocoder_wavernn.py | 2 +- TTS/tts/configs/shared_configs.py | 20 ++++++++++++++++ TTS/tts/configs/tacotron_config.py | 21 +++++++++++------ TTS/tts/utils/speakers.py | 29 +++++++++++++----------- TTS/tts/utils/text/cleaners.py | 6 ++--- TTS/utils/tensorboard_logger.py | 2 +- tests/vocoder_tests/test_melgan_train.py | 1 + 13 files changed, 62 insertions(+), 31 deletions(-) diff --git 
a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index f5658dd2..d231484a 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -229,7 +229,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 50e95a2b..9a455a1b 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -270,7 +270,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 4ab0c899..742a27d8 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -256,7 +256,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, if global_step % config.tb_plot_step == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 098a8d3f..b5e38b80 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -327,7 +327,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, ap, "step_time": step_time, } iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) if global_step % config.save_step == 0: if config.checkpoint: diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 123d5a43..ea317ef6 100755 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -265,7 +265,7 @@ def train( if global_step % 10 == 0: iter_stats = {"lr_G": current_lr_G, "lr_D": current_lr_D, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index c0fcff51..c8f067ee 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -181,7 +181,7 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch if global_step % 10 == 0: iter_stats = {"lr": current_lr, "grad_norm": grad_norm.item(), "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index bcad9493..86a1506a 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ 
b/TTS/bin/train_vocoder_wavernn.py @@ -163,7 +163,7 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch if global_step % 10 == 0: iter_stats = {"lr": cur_lr, "step_time": step_time} iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + tb_logger.tb_train_step_stats(global_step, iter_stats) # save checkpoint if global_step % c.save_step == 0: diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index a501a880..a2d935c7 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -133,6 +133,18 @@ class BaseTTSConfig(BaseTrainingConfig): datasets (List[BaseDatasetConfig]): List of datasets used for training. If multiple datasets are provided, they are merged and used together for training. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to ``. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to ``. + lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. + test_sentences (List[str]): + List of sentences to be used at testing. Defaults to '[]' """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -158,3 +170,11 @@ class BaseTTSConfig(BaseTrainingConfig): add_blank: bool = False # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # optimizer + optimizer: str = MISSING + optimizer_params: dict = MISSING + # scheduler + lr_scheduler: str = '' + lr_scheduler_params: dict = field(default_factory=lambda: {}) + # testing + test_sentences: List[str] = field(default_factory=lambda:[]) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index a567cd88..ff8d89bb 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -78,10 +78,16 @@ class TacotronConfig(BaseTTSConfig): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. external_speaker_embedding_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to `RAdam`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to `NoamLR`. + lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. lr (float): Initial learning rate. Defaults to `1e-4`. 
wd (float): @@ -152,10 +158,11 @@ class TacotronConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 seq_len_norm: bool = False loss_masking: bool = True diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 84da1f72..4ab78f88 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Union +from typing import Union, List, Any import numpy as np import torch @@ -35,9 +35,7 @@ def save_speaker_mapping(out_path, speaker_mapping): def get_speakers(items): - """Returns a sorted, unique list of speakers in a given dataset.""" - speakers = {e[2] for e in items} - return sorted(speakers) + def parse_speakers(c, args, meta_data_train, OUT_PATH): @@ -121,26 +119,31 @@ class SpeakerManager: Args: x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". - speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by the - TTS model. Defaults to "". + speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by + TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "". """ def __init__( self, + data_items: List[List[Any]] = None, x_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", ): - self.x_vectors = None - self.speaker_ids = None - self.clip_ids = None + self.data_items = [] + self.x_vectors = [] + self.speaker_ids = [] + self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None + if data_items: + self.speaker_ids = self.parse_speakers() + if x_vectors_file_path: self.load_x_vectors_file(x_vectors_file_path) @@ -169,10 +172,10 @@ class SpeakerManager: return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) def parser_speakers_from_items(self, items: list): - speaker_ids = sorted({item[2] for item in items}) - self.speaker_ids = speaker_ids - num_speakers = len(speaker_ids) - return speaker_ids, num_speakers + speakers = sorted({item[2] for item in items}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} + num_speakers = len(self.speaker_ids) + return self.speaker_ids, num_speakers def save_ids_file(self, file_path: str): self._save_json(file_path, self.speaker_ids) diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 3d2caa97..4b041ed8 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -65,7 +65,7 @@ def basic_cleaners(text): def transliteration_cleaners(text): """Pipeline for non-English text that transliterates to ASCII.""" - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = lowercase(text) text = collapse_whitespace(text) return text @@ -89,7 +89,7 @@ def basic_turkish_cleaners(text): def english_cleaners(text): """Pipeline for English text, including number and abbreviation expansion.""" - text = convert_to_ascii(text) + # text = 
convert_to_ascii(text) text = lowercase(text) text = expand_time_english(text) text = expand_numbers(text) @@ -129,7 +129,7 @@ def chinese_mandarin_cleaners(text: str) -> str: def phoneme_cleaners(text): """Pipeline for phonemes mode, including number and abbreviation expansion.""" text = expand_numbers(text) - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = expand_abbreviations(text) text = replace_symbols(text) text = remove_aux_symbols(text) diff --git a/TTS/utils/tensorboard_logger.py b/TTS/utils/tensorboard_logger.py index 3874a42b..657deb5b 100644 --- a/TTS/utils/tensorboard_logger.py +++ b/TTS/utils/tensorboard_logger.py @@ -39,7 +39,7 @@ class TensorboardLogger(object): except RuntimeError: traceback.print_exc() - def tb_train_iter_stats(self, step, stats): + def tb_train_step_stats(self, step, stats): self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) def tb_train_epoch_stats(self, step, stats): diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 3ff65b5a..e3004db7 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -21,6 +21,7 @@ config = MelganConfig( print_step=1, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From d09385808a21afbc55f8b3343dbda292917e4c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:36:26 +0200 Subject: [PATCH 116/258] set test_sentences in config --- TTS/tts/configs/tacotron_config.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index ff8d89bb..2fc7cc78 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig @@ -176,6 +176,15 @@ class TacotronConfig(BaseTTSConfig): postnet_ssim_alpha: float = 0.25 ga_alpha: float = 5.0 + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." 
+ ]) + def check_values(self): if self.gradual_training: assert ( From b9bccbb24329fe5995439c93fcede8d2a69401f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:37:01 +0200 Subject: [PATCH 117/258] move load_meta_data and related functions to `datasets/__init__.py` --- TTS/tts/datasets/__init__.py | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index e69de29b..b238209f 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -0,0 +1,88 @@ +import sys +import numpy as np +from collections import Counter +from pathlib import Path +from TTS.tts.datasets.TTSDataset import TTSDataset +from TTS.tts.datasets.formatters import * + +#################### +# UTILITIES +#################### + + +def split_dataset(items): + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = min(500, int(len(items) * 0.01)) + assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." + np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + while len(items_eval) < eval_split_size: + item_idx = np.random.randint(0, len(items)) + speaker_to_be_removed = items[item_idx][-1] + if speaker_counter[speaker_to_be_removed] > 1: + items_eval.append(items[item_idx]) + speaker_counter[speaker_to_be_removed] -= 1 + del items[item_idx] + return items_eval, items + return items[:eval_split_size], items[eval_split_size:] + + +def load_meta_data(datasets, eval_split=True): + meta_data_train_all = [] + meta_data_eval_all = [] if eval_split else None + for dataset in datasets: + name = dataset["name"] + root_path = dataset["path"] + meta_file_train = dataset["meta_file_train"] + meta_file_val = dataset["meta_file_val"] + # setup the right data processor + preprocessor = _get_preprocessor_by_name(name) + # load train set + meta_data_train = preprocessor(root_path, meta_file_train) + print( + f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}" + ) + # load evaluation split if set + if eval_split: + if meta_file_val: + meta_data_eval = preprocessor(root_path, meta_file_val) + else: + meta_data_eval, meta_data_train = split_dataset( + meta_data_train) + meta_data_eval_all += meta_data_eval + meta_data_train_all += meta_data_train + # load attention masks for duration predictor training + if dataset.meta_file_attn_mask: + meta_data = dict( + load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + for idx, ins in enumerate(meta_data_train_all): + attn_file = meta_data[ins[1]].strip() + meta_data_train_all[idx].append(attn_file) + if meta_data_eval_all: + for idx, ins in enumerate(meta_data_eval_all): + attn_file = meta_data[ins[1]].strip() + meta_data_eval_all[idx].append(attn_file) + return meta_data_train_all, meta_data_eval_all + + +def load_attention_mask_meta_data(metafile_path): + """Load meta data file created by compute_attention_masks.py""" + with open(metafile_path, "r") as f: + lines = f.readlines() + + meta_data = [] + for line in lines: + wav_file, attn_file = line.split("|") + meta_data.append([wav_file, attn_file]) + return meta_data + + +def _get_preprocessor_by_name(name): + """Returns the respective preprocessing function.""" + thismodule = sys.modules[__name__] + return getattr(thismodule, name.lower()) From 
a20a1c7d063cc69ebe4462451be494654dc72603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:38:05 +0200 Subject: [PATCH 118/258] rename preprocess.py -> formatters.py --- .../datasets/{preprocess.py => formatters.py} | 81 ------------------- TTS/utils/arguments.py | 39 ++++++--- 2 files changed, 26 insertions(+), 94 deletions(-) rename TTS/tts/datasets/{preprocess.py => formatters.py} (80%) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/formatters.py similarity index 80% rename from TTS/tts/datasets/preprocess.py rename to TTS/tts/datasets/formatters.py index 62cb9fef..f43733b1 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/formatters.py @@ -1,93 +1,12 @@ import os import re -import sys import xml.etree.ElementTree as ET -from collections import Counter from glob import glob from pathlib import Path from typing import List -import numpy as np from tqdm import tqdm -#################### -# UTILITIES -#################### - - -def split_dataset(items): - speakers = [item[-1] for item in items] - is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = min(500, int(len(items) * 0.01)) - assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." - np.random.seed(0) - np.random.shuffle(items) - if is_multi_speaker: - items_eval = [] - speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) - while len(items_eval) < eval_split_size: - item_idx = np.random.randint(0, len(items)) - speaker_to_be_removed = items[item_idx][-1] - if speaker_counter[speaker_to_be_removed] > 1: - items_eval.append(items[item_idx]) - speaker_counter[speaker_to_be_removed] -= 1 - del items[item_idx] - return items_eval, items - return items[:eval_split_size], items[eval_split_size:] - - -def load_meta_data(datasets, eval_split=True): - meta_data_train_all = [] - meta_data_eval_all = [] if eval_split else None - for dataset in datasets: - name = dataset["name"] - root_path = dataset["path"] - meta_file_train = dataset["meta_file_train"] - meta_file_val = dataset["meta_file_val"] - # setup the right data processor - preprocessor = get_preprocessor_by_name(name) - # load train set - meta_data_train = preprocessor(root_path, meta_file_train) - print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") - # load evaluation split if set - if eval_split: - if meta_file_val: - meta_data_eval = preprocessor(root_path, meta_file_val) - else: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - meta_data_eval_all += meta_data_eval - meta_data_train_all += meta_data_train - # load attention masks for duration predictor training - if dataset.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) - for idx, ins in enumerate(meta_data_train_all): - attn_file = meta_data[ins[1]].strip() - meta_data_train_all[idx].append(attn_file) - if meta_data_eval_all: - for idx, ins in enumerate(meta_data_eval_all): - attn_file = meta_data[ins[1]].strip() - meta_data_eval_all[idx].append(attn_file) - return meta_data_train_all, meta_data_eval_all - - -def load_attention_mask_meta_data(metafile_path): - """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r") as f: - lines = f.readlines() - - meta_data = [] - for line in lines: - wav_file, attn_file = line.split("|") - meta_data.append([wav_file, attn_file]) - return meta_data - - -def get_preprocessor_by_name(name): - """Returns the 
respective preprocessing function.""" - thismodule = sys.modules[__name__] - return getattr(thismodule, name.lower()) - ######################## # DATASETS diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 5e6acd1d..3fc63e26 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -30,16 +30,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), + help=("Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored."), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="") parser.add_argument( "--best_path", type=str, @@ -49,12 +49,23 @@ def init_arguments(argv): ), default="", ) + parser.add_argument("--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in argv) + parser.add_argument("--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.") parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", + type=str, + default="", + help="DISTRIBUTED: process group id.") return parser @@ -149,7 +160,8 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, + config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -170,7 +182,8 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) + tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", + 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From 844abb3b1d05e990400fb13aff76f4e8a4029949 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:38:44 +0200 Subject: [PATCH 119/258] `setup_loss()` in `layer/__init__.py` --- TTS/tts/layers/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/TTS/tts/layers/__init__.py b/TTS/tts/layers/__init__.py index e69de29b..78f56a5d 100644 --- a/TTS/tts/layers/__init__.py +++ b/TTS/tts/layers/__init__.py @@ -0,0 +1,15 @@ +from TTS.tts.layers.losses import * + + +def setup_loss(config): + if config.model.lower() in ["tacotron", "tacotron2"]: + model = TacotronLoss(config) + elif config.model.lower() == "glow_tts": + model = GlowTTSLoss() + elif config.model.lower() == "speedy_speech": + model = SpeedySpeechLoss(config) + elif config.model.lower() == "align_tts": + model = AlignTTSLoss(config) + else: + raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.") + return model From ca302db7b0bec4e4f9a4a5c5f2c79b269eb2dc31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:38:31 +0200 Subject: [PATCH 120/258] add sequence_mask to `utils.data` --- TTS/tts/layers/losses.py | 2 +- TTS/tts/utils/data.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 729a21af..27c6e9e5 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -5,7 +5,7 @@ import torch from torch import nn from torch.nn import functional -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.ssim import ssim diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 259a32d9..5f8624e6 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,3 +1,4 @@ +import torch import numpy as np @@ -65,3 +66,12 @@ class StandardScaler: X *= self.scale_ X += self.mean_ return X + + +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) + # B x T_max + return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) From da7d10e53c90232a9700758948488c0b9870ddfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:40:23 +0200 Subject: [PATCH 121/258] mode `setup_model()` to `models/__init__.py` --- TTS/tts/models/__init__.py | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index e69de29b..153f8d43 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -0,0 +1,108 @@ +from TTS.utils.generic_utils import find_module + + +def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): + print(" > Using model: {}".format(c.model)) + MyModel = find_module("TTS.tts.models", c.model.lower()) + if c.model.lower() in "tacotron": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), + decoder_output_dim=c.audio["num_mels"], + use_gst=c.use_gst, + gst=c.gst, + memory_size=c.memory_size, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + 
prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + prenet_dropout_at_inference=c.prenet_dropout_at_inference, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "tacotron2": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=c.audio["num_mels"], + decoder_output_dim=c.audio["num_mels"], + use_gst=c.use_gst, + gst=c.gst, + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + prenet_dropout_at_inference=c.prenet_dropout_at_inference, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder, + double_decoder_consistency=c.double_decoder_consistency, + ddc_r=c.ddc_r, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "glow_tts": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + hidden_channels_enc=c["hidden_channels_encoder"], + hidden_channels_dec=c["hidden_channels_decoder"], + hidden_channels_dp=c["hidden_channels_duration_predictor"], + out_channels=c.audio["num_mels"], + encoder_type=c.encoder_type, + encoder_params=c.encoder_params, + use_encoder_prenet=c["use_encoder_prenet"], + inference_noise_scale=c.inference_noise_scale, + num_flow_blocks_dec=12, + kernel_size_dec=5, + dilation_rate=1, + num_block_layers=4, + dropout_p_dec=0.05, + num_speakers=num_speakers, + c_in_channels=0, + num_splits=4, + num_squeeze=2, + sigmoid_scale=False, + mean_only=True, + speaker_embedding_dim=speaker_embedding_dim, + ) + elif c.model.lower() == "speedy_speech": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + out_channels=c.audio["num_mels"], + hidden_channels=c["hidden_channels"], + positional_encoding=c["positional_encoding"], + encoder_type=c["encoder_type"], + encoder_params=c["encoder_params"], + decoder_type=c["decoder_type"], + decoder_params=c["decoder_params"], + c_in_channels=0, + ) + elif c.model.lower() == "align_tts": + model = MyModel( + num_chars=num_chars + getattr(c, "add_blank", False), + out_channels=c.audio["num_mels"], + hidden_channels=c["hidden_channels"], + hidden_channels_dp=c["hidden_channels_dp"], + encoder_type=c["encoder_type"], + encoder_params=c["encoder_params"], + decoder_type=c["decoder_type"], + decoder_params=c["decoder_params"], + c_in_channels=0, + ) + return model From 5a2e75f0ee21e1c44404ed70717746c35fba5da5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:41:22 +0200 Subject: [PATCH 122/258] import missings for tacotron.py --- TTS/tts/models/tacotron.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 89d98e9f..4413b015 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -2,6 +2,8 @@ import torch from torch import nn +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import 
plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.models.tacotron_abstract import TacotronAbstract From bdbfc95618dc5ba1bb10dc29bf1efdd510b8f9e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 10:42:16 +0200 Subject: [PATCH 123/258] add `gradual_training` argument to tacotron.py --- TTS/tts/models/tacotron.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 4413b015..c1d95a25 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -47,8 +47,9 @@ class Tacotron(TacotronAbstract): gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` output frames to the prenet. + gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + Defaults to `[]`. """ - def __init__( self, num_chars, @@ -77,6 +78,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, + gradual_training=[] ): super().__init__( num_chars, @@ -104,6 +106,7 @@ class Tacotron(TacotronAbstract): speaker_embedding_dim, use_gst, gst, + gradual_training ) # speaker embedding layers From 535a458f40ad8890a3b07d83fad116689a7c5553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:36:06 +0200 Subject: [PATCH 124/258] update Tacotron models for the trainer --- TTS/tts/configs/tacotron_config.py | 1 + TTS/tts/models/tacotron.py | 198 ++++++++++++++---- TTS/tts/models/tacotron2.py | 308 +++++++++++++++++----------- TTS/tts/models/tacotron_abstract.py | 26 ++- 4 files changed, 373 insertions(+), 160 deletions(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 2fc7cc78..90decaa3 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -126,6 +126,7 @@ class TacotronConfig(BaseTTSConfig): use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None + # model specific params r: int = 2 gradual_training: List[List[int]] = None diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index c1d95a25..23bd839f 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -113,7 +113,8 @@ class Tacotron(TacotronAbstract): if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, + speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -144,7 +145,8 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, + postnet_output_dim) # setup prenet dropout self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference @@ -181,93 +183,203 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) - def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, + text, + text_lengths, + mel_specs=None, + mel_lengths=None, 
+ cond_input=None): """ Shapes: - characters: [B, T_in] + text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + outputs = { + 'alignments_backward': None, + 'decoder_outputs_backward': None + } input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim - inputs = self.embedding(characters) + inputs = self.embedding(text) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( + encoder_outputs) # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, + cond_input['x_vectors']) # speaker embedding if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, + None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in - decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) # sequence masking if output_mask is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze( + 1).expand_as(decoder_outputs) # B x T_out x decoder_in_features postnet_outputs = self.postnet(decoder_outputs) # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze( + 2).expand_as(postnet_outputs) # B x T_out x posnet_dim postnet_outputs = self.last_linear(postnet_outputs) # B x T_out x decoder_in_features decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + decoder_outputs_backward, alignments_backward = self._backward_pass( + mel_specs, encoder_outputs, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask - ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + mel_specs, encoder_outputs, 
alignments, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward + outputs.update({ + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + }) + return outputs @torch.no_grad() - def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): - inputs = self.embedding(characters) + def inference(self, + text_input, + cond_input=None): + inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + outputs = { + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + } + return outputs + + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
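+
+        A rough usage sketch, assuming ``batch`` is a dict carrying the keys read below and
+        ``criterion`` is the loss returned by ``TTS.tts.layers.setup_loss(config)``::
+
+            criterion = setup_loss(config)  # resolves to TacotronLoss for Tacotron models
+            outputs, loss_dict = model.train_step(batch, criterion)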
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] + """ + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + linear_input = batch['linear_input'] + stop_targets = batch['stop_targets'] + speaker_ids = batch['speaker_ids'] + x_vectors = batch['x_vectors'] + + # forward pass model + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={ + 'speaker_ids': speaker_ids, + 'x_vectors': x_vectors + }) + + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + + (self.decoder.r - + (mel_lengths.max() % self.decoder.r))) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r + + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, + mel_lengths, cond_input) + + # compute loss + loss_dict = criterion( + outputs['postnet_outputs'], + outputs['decoder_outputs'], + mel_input, + linear_input, + outputs['stop_tokens'], + stop_targets, + mel_lengths, + outputs['decoder_outputs_backward'], + outputs['alignments'], + alignment_lengths, + outputs['alignments_backward'], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments']) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap, batch, outputs): + postnet_outputs = outputs['postnet_outputs'] + alignments = outputs['alignments'] + alignments_backward = outputs['alignments_backward'] + mel_input = batch['mel_input'] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment( + alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_spectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return self.train_log(ap, batch, outputs) \ No newline at end of file diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 525eb8b3..51b181e4 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,12 +1,15 @@ +# coding: utf-8 +import numpy as np import torch from torch import nn +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.models.tacotron_abstract import TacotronAbstract -# TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): """Tacotron2 as in https://arxiv.org/abs/1712.05884 @@ -43,69 +46,52 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. 
gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. + gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + Defaults to `[]`. """ - - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - speaker_embedding_dim, - use_gst, - gst, - ) + def __init__(self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type="original", + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + prenet_dropout_at_inference=False, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + use_gst=False, + gst=None, + gradual_training=[]): + super().__init__(num_chars, num_speakers, r, postnet_output_dim, + decoder_output_dim, attn_type, attn_win, attn_norm, + prenet_type, prenet_dropout, + prenet_dropout_at_inference, forward_attn, + trans_agent, forward_attn_mask, location_attn, attn_K, + separate_stopnet, bidirectional_decoder, + double_decoder_consistency, ddc_r, + encoder_in_features, decoder_in_features, + speaker_embedding_dim, use_gst, gst, gradual_training) # speaker embedding layer if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, + speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -176,16 +162,24 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, + text, + text_lengths, + mel_specs=None, + mel_lengths=None, + cond_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + outputs = { + 'alignments_backward': None, + 'decoder_outputs_backward': None + } # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -195,94 +189,176 @@ class 
Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, + None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, speaker_embeddings) - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( + encoder_outputs) # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r - decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder( + encoder_outputs, mel_specs, input_mask) # sequence masking if mel_lengths is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze( + 1).expand_as(decoder_outputs) # B x mel_dim x T_out postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze( + 1).expand_as(postnet_outputs) # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in - decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + decoder_outputs_backward, alignments_backward = self._backward_pass( + mel_specs, encoder_outputs, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask - ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + mel_specs, encoder_outputs, alignments, input_mask) + outputs['alignments_backward'] = alignments_backward + outputs['decoder_outputs_backward'] = decoder_outputs_backward + outputs.update({ + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + }) + return outputs @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def 
inference(self, text, cond_input=None): embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], + cond_input['x_vectors']) if self.num_speakers > 1: if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] + x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) + else: + x_vector = cond_input - decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) + encoder_outputs = self._concat_speaker_embedding( + encoder_outputs, x_vector) + + decoder_outputs, alignments, stop_tokens = self.decoder.inference( + encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs - decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + decoder_outputs, postnet_outputs, alignments = self.shape_outputs( + decoder_outputs, postnet_outputs, alignments) + outputs = { + 'postnet_outputs': postnet_outputs, + 'decoder_outputs': decoder_outputs, + 'alignments': alignments, + 'stop_tokens': stop_tokens + } + return outputs - def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
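+
+        Returns the forward-pass ``outputs`` dict and a ``loss_dict``. A rough sketch of how
+        the pair is consumed, assuming ``ap`` is the dataset's ``AudioProcessor``::
+
+            outputs, loss_dict = model.train_step(batch, criterion)
+            figures, train_audio = model.train_log(ap, batch, outputs)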
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] """ - Preserve model states for continuous inference - """ - embedded_inputs = self.embedding(text).transpose(1, 2) - encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + linear_input = batch['linear_input'] + stop_targets = batch['stop_targets'] + speaker_ids = batch['speaker_ids'] + x_vectors = batch['x_vectors'] - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + # forward pass model + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={ + 'speaker_ids': speaker_ids, + 'x_vectors': x_vectors + }) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + + (self.decoder.r - + (mel_lengths.max() % self.decoder.r))) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r - mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(mel_outputs, mel_outputs_postnet, alignments) - return mel_outputs, mel_outputs_postnet, alignments, stop_tokens + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, + mel_lengths, cond_input) + + # compute loss + loss_dict = criterion( + outputs['model_outputs'], + outputs['decoder_outputs'], + mel_input, + linear_input, + outputs['stop_tokens'], + stop_targets, + mel_lengths, + outputs['decoder_outputs_backward'], + outputs['alignments'], + alignment_lengths, + outputs['alignments_backward'], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments']) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap, batch, outputs): + postnet_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + alignments_backward = outputs['alignments_backward'] + mel_input = batch['mel_input'] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment( + alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return self.train_log(ap, batch, outputs) diff --git 
a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index e684ce7c..2bea06a9 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,10 +1,12 @@ import copy +import logging from abc import ABC, abstractmethod import torch from torch import nn -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask +from TTS.utils.training import gradual_training_scheduler class TacotronAbstract(ABC, nn.Module): @@ -35,6 +37,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, + gradual_training=[] ): """Abstract Tacotron class""" super().__init__() @@ -63,6 +66,7 @@ class TacotronAbstract(ABC, nn.Module): self.encoder_in_features = encoder_in_features self.decoder_in_features = decoder_in_features self.speaker_embedding_dim = speaker_embedding_dim + self.gradual_training = gradual_training # layers self.embedding = None @@ -216,3 +220,23 @@ class TacotronAbstract(ABC, nn.Module): speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) return outputs + + ############################# + # CALLBACKS + ############################# + + def on_epoch_start(self, trainer): + """Callback for setting values wrt gradual training schedule. + + Args: + trainer (TrainerTTS): TTS trainer object that is used to train this model. + """ + if self.gradual_training: + r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) + trainer.config.r = r + self.decoder.set_r(r) + if trainer.config.bidirectional_decoder: + trainer.model.decoder_backward.set_r(r) + trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) + trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) + logging.info(f"\n > Number of output frames: {self.decoder.r}") From 130781dab67b4c9301a35278fe0f55ac9f94995b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:38:54 +0200 Subject: [PATCH 125/258] remove `tts.generic_utils` as all the functions are moved to other files --- TTS/tts/utils/generic_utils.py | 278 --------------------------------- 1 file changed, 278 deletions(-) delete mode 100644 TTS/tts/utils/generic_utils.py diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py deleted file mode 100644 index b0e53f33..00000000 --- a/TTS/tts/utils/generic_utils.py +++ /dev/null @@ -1,278 +0,0 @@ -import torch - -from TTS.utils.generic_utils import find_module - - -# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.data.max() - seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) - # B x T_max - return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) - - -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): - print(" > Using model: {}".format(c.model)) - MyModel = find_module("TTS.tts.models", c.model.lower()) - if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - 
attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "glow_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - hidden_channels_enc=c["hidden_channels_encoder"], - hidden_channels_dec=c["hidden_channels_decoder"], - hidden_channels_dp=c["hidden_channels_duration_predictor"], - out_channels=c.audio["num_mels"], - encoder_type=c.encoder_type, - encoder_params=c.encoder_params, - use_encoder_prenet=c["use_encoder_prenet"], - inference_noise_scale=c.inference_noise_scale, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.05, - num_speakers=num_speakers, - c_in_channels=0, - num_splits=4, - num_squeeze=2, - sigmoid_scale=False, - mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "speedy_speech": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - positional_encoding=c["positional_encoding"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - elif c.model.lower() == "align_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - hidden_channels_dp=c["hidden_channels_dp"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - return model - - -def is_tacotron(c): - return "tacotron" in c["model"].lower() - - -# def check_config_tts(c): -# check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech', 'align_tts'], restricted=True, val_type=str) -# check_argument('run_name', c, restricted=True, val_type=str) -# check_argument('run_description', c, val_type=str) - -# # AUDIO -# # check_argument('audio', c, restricted=True, val_type=dict) - -# # audio processing parameters -# # check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, 
max_val=2056) -# # check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) -# # check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) -# # check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') -# # check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') -# # check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) -# # check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) -# # check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) -# # check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) -# # check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) - -# # vocabulary parameters -# check_argument('characters', c, restricted=False, val_type=dict) -# check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys() and c['use_phonemes'], val_type=str) -# check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - -# # normalization parameters -# # check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) -# # check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) -# # check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) -# # check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100) -# # check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) -# # check_argument('trim_db', c['audio'], restricted=True, val_type=int) - -# # training parameters -# # check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('r', c, restricted=True, val_type=int, min_val=1) -# # check_argument('gradual_training', c, restricted=False, val_type=list) -# # check_argument('mixed_precision', c, restricted=False, val_type=bool) -# # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) - -# # loss parameters -# # check_argument('loss_masking', c, restricted=True, val_type=bool) -# # if c['model'].lower() in ['tacotron', 'tacotron2']: -# # check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_loss_alpha', c, 
restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0) - -# # validation parameters -# # check_argument('run_eval', c, restricted=True, val_type=bool) -# # check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) -# # check_argument('test_sentences_file', c, restricted=False, val_type=str) - -# # optimizer -# check_argument('noam_schedule', c, restricted=False, val_type=bool) -# check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) -# check_argument('epochs', c, restricted=True, val_type=int, min_val=1) -# check_argument('lr', c, restricted=True, val_type=float, min_val=0) -# check_argument('wd', c, restricted=is_tacotron(c), val_type=float, min_val=0) -# check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) -# check_argument('seq_len_norm', c, restricted=is_tacotron(c), val_type=bool) - -# # tacotron prenet -# # check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1) -# # check_argument('prenet_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['original', 'bn']) -# # check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool) - -# # attention -# check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original', 'dynamic_convolution']) -# check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int) -# check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax']) -# check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('use_forward_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('forward_attn_mask', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('location_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('bidirectional_decoder', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) - -# if c['model'].lower() in ['tacotron', 'tacotron2']: -# # stopnet -# # check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool) - -# # Model Parameters for non-tacotron models -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('positional_encoding', c, restricted=True, val_type=type) -# check_argument('encoder_type', c, restricted=True, val_type=str) -# check_argument('encoder_params', 
c, restricted=True, val_type=dict) -# check_argument('decoder_residual_conv_bn_params', c, restricted=True, val_type=dict) - -# # GlowTTS parameters -# check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str) - -# # tensorboard -# # check_argument('print_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('save_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('checkpoint', c, restricted=True, val_type=bool) -# # check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) - -# # dataloading -# # pylint: disable=import-outside-toplevel -# from TTS.tts.utils.text import cleaners -# # check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) -# # check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) -# # check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) -# # check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) -# # check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) -# # check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool) - -# # paths -# # check_argument('output_path', c, restricted=True, val_type=str) - -# # multi-speaker and gst -# # check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) -# # check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool) -# # check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str) -# if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']: -# # check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst', c, restricted=is_tacotron(c), val_type=dict) -# # check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict]) -# # check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000) -# # check_argument('gst_use_speaker_embedding', c['gst'], restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst_num_heads', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10) -# # check_argument('gst_num_style_tokens', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000) - -# # datasets - checking only the first entry -# # check_argument('datasets', c, restricted=True, val_type=list) -# # for dataset_entry in c['datasets']: -# # check_argument('name', dataset_entry, restricted=True, val_type=str) -# # check_argument('path', dataset_entry, restricted=True, val_type=str) -# # check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) -# # check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) From f4f83b63795cff8165b60d394c57d412248a427f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:40:25 +0200 Subject: [PATCH 126/258] update `synthesis.py` for the trainer --- TTS/tts/utils/speakers.py | 8 +- TTS/tts/utils/synthesis.py | 186 +++++++++++++++++++------------------ 2 files changed, 97 insertions(+), 97 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 
4ab78f88..374139ee 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -34,10 +34,6 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speakers(items): - - - def parse_speakers(c, args, meta_data_train, OUT_PATH): """Returns number of speakers, speaker embedding shape and speaker mapping""" if c.use_speaker_embedding: @@ -135,7 +131,7 @@ class SpeakerManager: ): self.data_items = [] - self.x_vectors = [] + self.x_vectors = {} self.speaker_ids = [] self.clip_ids = [] self.speaker_encoder = None @@ -171,7 +167,7 @@ class SpeakerManager: def x_vector_dim(self): return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) - def parser_speakers_from_items(self, items: list): + def parse_speakers_from_items(self, items: list): speakers = sorted({item[2] for item in items}) self.speaker_ids = {name: i for i, name in enumerate(speakers)} num_speakers = len(self.speaker_ids) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 0ddf7ebe..90017bb1 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -13,7 +13,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed: import tensorflow as tf -def text_to_seqvec(text, CONFIG): +def text_to_seq(text, CONFIG): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -59,81 +59,82 @@ def numpy_to_tf(np_array, dtype): def compute_style_mel(style_wav, ap, cuda=False): - style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) + style_mel = torch.FloatTensor( + ap.melspectrogram(ap.load_wav(style_wav, + sr=ap.sample_rate))).unsqueeze(0) if cuda: return style_mel.cuda() return style_mel -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None): - if "tacotron" in CONFIG.model.lower(): - if CONFIG.gst: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - if truncated: - decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - elif "glow" in CONFIG.model.lower(): - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, _, _, _, alignments, _, _ = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, _, _, _, alignments, _, _ = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to tacotron models. 
- decoder_output = None - stop_tokens = None - elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, alignments = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, alignments = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to tacotron models. - decoder_output = None - stop_tokens = None - else: - raise ValueError("[!] Unknown model name.") - return decoder_output, postnet_output, alignments, stop_tokens +def run_model_torch(model, + inputs, + speaker_id=None, + style_mel=None, + x_vector=None): + outputs = model.inference(inputs, + cond_input={ + 'speaker_ids': speaker_id, + 'x_vector': x_vector, + 'style_mel': style_mel + }) + # elif "glow" in CONFIG.model.lower(): + # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable + # if hasattr(model, "module"): + # # distributed model + # postnet_output, _, _, _, alignments, _, _ = model.module.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # else: + # postnet_output, _, _, _, alignments, _, _ = model.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # postnet_output = postnet_output.permute(0, 2, 1) + # # these only belong to tacotron models. + # decoder_output = None + # stop_tokens = None + # elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: + # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable + # if hasattr(model, "module"): + # # distributed model + # postnet_output, alignments = model.module.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # else: + # postnet_output, alignments = model.inference( + # inputs, + # inputs_lengths, + # g=speaker_id if speaker_id is not None else speaker_embeddings) + # postnet_output = postnet_output.permute(0, 2, 1) + # # these only belong to tacotron models. + # decoder_output = None + # stop_tokens = None + # else: + # raise ValueError("[!] Unknown model name.") + return outputs -def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: raise NotImplementedError(" [!] GST inference not implemented for TF") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TF") if speaker_id is not None: raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model(inputs, training=False) + decoder_output, postnet_output, alignments, stop_tokens = model( + inputs, training=False) return decoder_output, postnet_output, alignments, stop_tokens -def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: - raise NotImplementedError(" [!] 
GST inference not implemented for TfLite") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TfLite") + raise NotImplementedError( + " [!] GST inference not implemented for TfLite") if speaker_id is not None: - raise NotImplementedError(" [!] Multi-Speaker not implemented for TfLite") + raise NotImplementedError( + " [!] Multi-Speaker not implemented for TfLite") # get input and output details input_details = model.get_input_details() output_details = model.get_output_details() @@ -152,9 +153,11 @@ def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_me return decoder_output, postnet_output, None, None -def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): +def parse_outputs_torch(postnet_output, decoder_output, alignments, + stop_tokens): postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = None if decoder_output is None else decoder_output[0].data.cpu().numpy() + decoder_output = None if decoder_output is None else decoder_output[ + 0].data.cpu().numpy() alignment = alignments[0].cpu().data.numpy() stop_tokens = None if stop_tokens is None else stop_tokens[0].cpu().numpy() return postnet_output, decoder_output, alignment, stop_tokens @@ -175,7 +178,7 @@ def parse_outputs_tflite(postnet_output, decoder_output): def trim_silence(wav, ap): - return wav[: ap.find_endpoint(wav)] + return wav[:ap.find_endpoint(wav)] def inv_spectrogram(postnet_output, ap, CONFIG): @@ -186,23 +189,23 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(speaker_id, cuda=False): +def speaker_id_to_torch(speaker_id, cuda=False): if speaker_id is not None: speaker_id = np.asarray(speaker_id) - # TODO: test this for tacotron models speaker_id = torch.from_numpy(speaker_id) if cuda: return speaker_id.cuda() return speaker_id -def embedding_to_torch(speaker_embedding, cuda=False): - if speaker_embedding is not None: - speaker_embedding = np.asarray(speaker_embedding) - speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(x_vector, cuda=False): + if x_vector is not None: + x_vector = np.asarray(x_vector) + x_vector = torch.from_numpy(x_vector).unsqueeze( + 0).type(torch.FloatTensor) if cuda: - return speaker_embedding.cuda() - return speaker_embedding + return x_vector.cuda() + return x_vector # TODO: perform GL with pytorch for batching @@ -216,7 +219,8 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): """ wavs = [] for idx, spec in enumerate(inputs): - wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding + wav_len = (input_lens[idx] * + ap.hop_length) - ap.hop_length # inverse librosa padding wav = inv_spectrogram(spec, ap, CONFIG) # assert len(wav) == wav_len, f" [!] wav lenght: {len(wav)} vs expected: {wav_len}" wavs.append(wav[:wav_len]) @@ -231,11 +235,10 @@ def synthesis( ap, speaker_id=None, style_wav=None, - truncated=False, enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - speaker_embedding=None, + x_vector=None, backend="torch", ): """Synthesize voice for the given text. @@ -249,8 +252,6 @@ def synthesis( model outputs. speaker_id (int): id of speaker style_wav (str | Dict[str, float]): Uses for style embedding of GST. - truncated (bool): keep model states after inference. It can be used - for continuous inference at long texts. 
enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. do_trim_silence (bool): trim silence after synthesis. backend (str): tf or torch @@ -263,14 +264,15 @@ def synthesis( else: style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text - inputs = text_to_seqvec(text, CONFIG) + inputs = text_to_seq(text, CONFIG) # pass tensors to backend if backend == "torch": if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if speaker_embedding is not None: - speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) + if x_vector is not None: + x_vector = embedding_to_torch(x_vector, + cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) @@ -287,24 +289,26 @@ def synthesis( inputs = tf.expand_dims(inputs, 0) # synthesize voice if backend == "torch": - decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding - ) + outputs = run_model_torch(model, + inputs, + speaker_id, + style_mel, + x_vector=x_vector) + postnet_output, decoder_output, alignments, stop_tokens = \ + outputs['postnet_outputs'], outputs['decoder_outputs'],\ + outputs['alignments'], outputs['stop_tokens'] postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens - ) + postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, truncated, speaker_id, style_mel - ) + model, inputs, CONFIG, speaker_id, style_mel) postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens - ) + postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, inputs, CONFIG, truncated, speaker_id, style_mel - ) - postnet_output, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) + model, inputs, CONFIG, speaker_id, style_mel) + postnet_output, decoder_output = parse_outputs_tflite( + postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None From 178eccbc169de0e4b55100058ba4277aa9fe405b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:37 +0200 Subject: [PATCH 127/258] update console logger --- TTS/utils/console_logger.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TTS/utils/console_logger.py b/TTS/utils/console_logger.py index 7d6e1968..bb6644c9 100644 --- a/TTS/utils/console_logger.py +++ b/TTS/utils/console_logger.py @@ -68,11 +68,10 @@ class ConsoleLogger: print(log_text, flush=True) def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") + print(f"\n{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") def print_eval_step(self, step, loss_dict, avg_loss_dict): indent = " | > " - print() log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" for key, value in loss_dict.items(): # print the avg value if given @@ -84,7 +83,7 @@ class ConsoleLogger: def print_epoch_end(self, epoch, avg_loss_dict): indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) + log_text = "\n 
{}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given color = "" From 34f8a74e4d264d23b5f32942b185f6e997ebd6b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:56 +0200 Subject: [PATCH 128/258] remove `truncated` from synthesizer --- TTS/utils/synthesizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bca3df31..5962950f 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -230,7 +230,6 @@ class Synthesizer(object): ap=self.ap, speaker_id=None, style_wav=style_wav, - truncated=False, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, speaker_embedding=speaker_embedding, From 5f07315722b57199db97cdd5b069c4455f142337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:45:27 +0200 Subject: [PATCH 129/258] add trainer and train_tts --- TTS/bin/train_tts.py | 28 ++ TTS/trainer.py | 756 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 784 insertions(+) create mode 100644 TTS/bin/train_tts.py create mode 100644 TTS/trainer.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py new file mode 100644 index 00000000..5058d341 --- /dev/null +++ b/TTS/bin/train_tts.py @@ -0,0 +1,28 @@ +import os +import sys +import traceback +from TTS.utils.arguments import init_training +from TTS.utils.generic_utils import remove_experiment_folder +from TTS.trainer import TrainerTTS + + +def main(): + # try: + args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training( + sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) + trainer.fit() + # except KeyboardInterrupt: + # remove_experiment_folder(OUT_PATH) + # try: + # sys.exit(0) + # except SystemExit: + # os._exit(0) # pylint: disable=protected-access + # except Exception: # pylint: disable=broad-except + # remove_experiment_folder(OUT_PATH) + # traceback.print_exc() + # sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/TTS/trainer.py b/TTS/trainer.py new file mode 100644 index 00000000..cfb72191 --- /dev/null +++ b/TTS/trainer.py @@ -0,0 +1,756 @@ +# -*- coding: utf-8 -*- + +import os +import sys +import time +import traceback +from random import randrange +import logging +import importlib + +import numpy as np +import torch + +# DISTRIBUTED +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.tts.datasets import load_meta_data, TTSDataset +from TTS.tts.layers import setup_loss +from TTS.tts.models import setup_model +from TTS.tts.utils.io import save_best_model, save_checkpoint +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.utils.arguments import init_training +from TTS.tts.utils.visual import plot_spectrogram, plot_alignment +from TTS.utils.audio import AudioProcessor +from TTS.utils.distribute import init_distributed, reduce_tensor +from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict, find_module +from TTS.utils.training import setup_torch_training_env, check_update + + +@dataclass +class TrainingArgs(Coqpit): + continue_path: str = field( + default='', + metadata={ + 'help': + 'Path to a training folder to 
continue training. Restore the model from the last checkpoint and continue training under the same folder.' + }) + restore_path: str = field( + default='', + metadata={ + 'help': + 'Path to a model checkpoit. Restore the model with the given checkpoint and start a new training.' + }) + best_path: str = field( + default='', + metadata={ + 'help': + "Best model file to be used for extracting best loss. If not specified, the latest best model in continue path is used" + }) + config_path: str = field( + default='', metadata={'help': 'Path to the configuration file.'}) + rank: int = field( + default=0, metadata={'help': 'Process rank in distributed training.'}) + group_id: str = field( + default='', + metadata={'help': 'Process group id in distributed training.'}) + + +# pylint: disable=import-outside-toplevel, too-many-public-methods +class TrainerTTS: + use_cuda, num_gpus = setup_torch_training_env(True, False) + + def __init__(self, + args, + config, + c_logger, + tb_logger, + model=None, + output_path=None): + self.args = args + self.config = config + self.c_logger = c_logger + self.tb_logger = tb_logger + self.output_path = output_path + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, 'test_audios') + + self.keep_avg_train = None + self.keep_avg_eval = None + + # model, audio processor, datasets, loss + # init audio processor + self.ap = AudioProcessor(**config.audio.to_dict()) + + # init character processor + self.model_characters = self.init_character_processor() + + # load dataset samples + self.data_train, self.data_eval = load_meta_data(config.datasets) + + # default speaker manager + self.speaker_manager = self.init_speaker_manager() + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.init_model() + + # setup criterion + self.criterion = self.init_criterion() + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed(args.rank, self.num_gpus, args.group_id, + config.distributed["backend"], + config.distributed["url"]) + + # scalers for mixed precision training + self.scaler = torch.cuda.amp.GradScaler( + ) if config.mixed_precision else None + + # setup optimizer + self.optimizer = self.init_optimizer(self.model) + + # setup scheduler + self.scheduler = self.init_scheduler(self.config, self.optimizer) + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, + self.scaler) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank]) + + # count model size + num_params = count_parameters(self.model) + logging.info("\n > Model has {} parameters".format(num_params), + flush=True) + + def init_model(self): + model = setup_model( + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.x_vector_dim + if self.speaker_manager.x_vectors else None, + ) + return model + + def init_optimizer(self, model): + optimizer_name = self.config.optimizer + optimizer_params = self.config.optimizer_params + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), + 
lr=self.config.lr, + **optimizer_params) + + def init_character_processor(self): + # setup custom characters if set in config file. + # TODO: implement CharacterProcessor + if self.config.characters is not None: + symbols, phonemes = make_symbols( + **self.config.characters.to_dict()) + else: + from TTS.tts.utils.text.symbols import symbols, phonemes + model_characters = phonemes if self.config.use_phonemes else symbols + return model_characters + + def init_speaker_manager(self, restore_path: str = "", out_path: str = ""): + speaker_manager = SpeakerManager() + if restore_path: + speakers_file = os.path.join(os.path.dirname(restore_path), + "speaker.json") + if not os.path.exists(speakers_file): + logging.info( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + speakers_file = self.config.external_speaker_embedding_file + + if self.config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + self.speaker_manage.load_speaker_mapping(speakers_file) + elif self.config.use_external_speaker_embedding_file and self.config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file( + self.config.external_speaker_embedding_file) + else: + speaker_manager.parse_speakers_from_items(self.data_train) + file_path = os.path.join(out_path, "speakers.json") + speaker_manager.save_ids_file(file_path) + return speaker_manager + + def init_scheduler(self, config, optimizer): + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + from TTS.utils.training import NoamLR + scheduler = NoamLR + else: + scheduler = getattr(torch.optim, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + def init_criterion(self): + return setup_loss(self.config) + + def restore_model(self, + config, + restore_path, + model, + optimizer, + scaler=None): + logging.info(f" > Restoring from {os.path.basename(restore_path)}...") + checkpoint = torch.load(restore_path, map_location="cpu") + try: + logging.info(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + # optimizer restore + logging.info(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scaler" in checkpoint and config.mixed_precision: + logging.info(" > Restoring AMP Scaler...") + scaler.load_state_dict(checkpoint["scaler"]) + except (KeyError, RuntimeError): + logging.info(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + for group in optimizer.param_groups: + group["lr"] = self.config.lr + logging.info(" > Model restored from step %d" % checkpoint["step"], + flush=True) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + def _setup_loader(self, r, ap, is_eval, data_items, verbose, + speaker_mapping): + if is_eval and not self.config.run_eval: + loader = None + else: + dataset = TTSDataset( + outputs_per_step=r, + text_cleaner=self.config.text_cleaner, + compute_linear_spec= 'tacotron' == self.config.model.lower(), + meta_data=data_items, + ap=ap, + tp=self.config.characters, + add_blank=self.config["add_blank"], + batch_group_size=0 if is_eval else + self.config.batch_group_size * self.config.batch_size, + min_seq_len=self.config.min_seq_len, + max_seq_len=self.config.max_seq_len, + 
phoneme_cache_path=self.config.phoneme_cache_path, + use_phonemes=self.config.use_phonemes, + phoneme_language=self.config.phoneme_language, + enable_eos_bos=self.config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_mapping=speaker_mapping + if self.config.use_speaker_embedding + and self.config.use_external_speaker_embedding_file else None, + ) + + if self.config.use_phonemes and self.config.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(self.config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler( + dataset) if self.num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.eval_batch_size + if is_eval else self.config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_val_loader_workers + if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def setup_train_dataloader(self, r, ap, data_items, verbose, + speaker_mapping): + return self._setup_loader(r, ap, False, data_items, verbose, + speaker_mapping) + + def setup_eval_dataloder(self, r, ap, data_items, verbose, + speaker_mapping): + return self._setup_loader(r, ap, True, data_items, verbose, + speaker_mapping) + + def format_batch(self, batch): + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron" + ] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + speaker_embeddings = batch[8] + attn_mask = batch[9] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # convert speaker names to ids + if self.config.use_speaker_embedding: + if self.config.use_external_speaker_embedding_file: + speaker_embeddings = batch[8] + speaker_ids = None + else: + speaker_ids = [ + self.speaker_manager.speaker_ids[speaker_name] + for speaker_name in speaker_names + ] + speaker_ids = torch.LongTensor(speaker_ids) + speaker_embeddings = None + else: + speaker_embeddings = None + speaker_ids = None + + # compute durations from attention masks + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, :text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. 
+ stop_targets = stop_targets.view(text_input.shape[0], + stop_targets.size(1) // self.config.r, + -1) + stop_targets = (stop_targets.sum(2) > + 0.0).unsqueeze(2).float().squeeze(2) + + # dispatch batch to GPU + if self.use_cuda: + text_input = text_input.cuda(non_blocking=True) + text_lengths = text_lengths.cuda(non_blocking=True) + mel_input = mel_input.cuda(non_blocking=True) + mel_lengths = mel_lengths.cuda(non_blocking=True) + linear_input = linear_input.cuda( + non_blocking=True) if self.config.model.lower() in [ + "tacotron" + ] else None + stop_targets = stop_targets.cuda(non_blocking=True) + attn_mask = attn_mask.cuda( + non_blocking=True) if attn_mask is not None else None + durations = durations.cuda( + non_blocking=True) if attn_mask is not None else None + if speaker_ids is not None: + speaker_ids = speaker_ids.cuda(non_blocking=True) + if speaker_embeddings is not None: + speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "x_vectors": speaker_embeddings, + "max_text_length": max_text_length, + "max_spec_length": max_spec_length, + "item_idx": item_idx + } + + def train_step(self, batch, batch_n_steps, step, loader_start_time): + self.on_train_step_start() + step_start_time = time.time() + + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # zero-out optimizer + self.optimizer.zero_grad() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self.model.train_step(batch, self.criterion) + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError( + f"Detected NaN loss at step {self.total_steps_done}.") + + # optimizer step + if self.config.mixed_precision: + # model optimizer step in mixed precision mode + self.scaler.scale(loss_dict["loss"]).backward() + self.scaler.unscale_(self.optimizer) + grad_norm, _ = check_update(self.model, + self.config.grad_clip, + ignore_stopnet=True) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + # main model optimizer step + loss_dict["loss"].backward() + grad_norm, _ = check_update(self.model, + self.config.grad_clip, + ignore_stopnet=True) + self.optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if self.config.lr_scheduler: + self.scheduler.step() + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_train_values = dict() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + self.keep_avg_train.update_values(update_train_values) + + # print training progress + current_lr = self.optimizer.param_groups[0]["lr"] + if self.total_steps_done % self.config.print_step == 0: + log_dict = { + "max_spec_length": [batch["max_spec_length"], + 1], # value, precision + "max_text_length": [batch["max_text_length"], 1], + "step_time": [step_time, 4], + "loader_time": [loader_time, 2], + "current_lr": current_lr, + } + self.c_logger.print_train_step(batch_n_steps, step, + 
self.total_steps_done, log_dict, + loss_dict, + self.keep_avg_train.avg_values) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = { + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + } + iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, + iter_stats) + + if self.total_steps_done % self.config.save_step == 0: + if self.config.checkpoint: + # save model + save_checkpoint( + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + model_loss=loss_dict["loss"], + characters=self.model_characters, + scaler=self.scaler.state_dict() + if self.config.mixed_precision else None, + ) + # training visualizations + figures, audios = self.model.train_log(self.ap, batch, outputs) + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + self.tb_logger.tb_train_audios(self.total_steps_done, + {"TrainAudio": audios}, + self.ap.sample_rate) + self.total_steps_done += 1 + self.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self): + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int( + len(self.train_loader.dataset) / + (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int( + len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.train_loader): + _, _ = self.train_step(batch, batch_num_steps, cur_step, + loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, + epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, + self.total_steps_done) + + def eval_step(self, batch, step): + with torch.no_grad(): + step_start_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self.model.eval_step( + batch, self.criterion) + + step_time = time.time() - step_start_time + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_step_time"] = step_time + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, + self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self): + self.model.eval() + if self.use_cuda: + batch_num_steps = int( + len(self.train_loader.dataset) / + (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int( + len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_eval_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({'avg_loader_time': loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # Plot epoch stats and 
samples from the last batch. + if self.args.rank == 0: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + self.tb_logger.tb_eval_audios(self.total_steps_done, + {"EvalAudio": eval_audios}, + self.ap.sample_rate) + + def test_run(self, ): + logging.info(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + cond_inputs = self._get_cond_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=cond_inputs['speaker_id'], + x_vector=cond_inputs['x_vector'], + style_wav=cond_inputs['style_wav'], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + os.makedirs(file_path, exist_ok=True) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + self.ap.save_wav(wav, file_path) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram( + model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment( + alignment, output_fig=False) + + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, + self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) + + def _get_cond_inputs(self): + # setup speaker_id + speaker_id = 0 if self.config.use_speaker_embedding else None + # setup x_vector + x_vector = self.speaker_manager.get_x_vectors_by_speaker( + self.speaker_manager.speaker_ids[0] + ) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None + # setup style_mel + if self.config.has('gst_style_input'): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and 'use_gst' in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
+ style_wav = {} + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + cond_inputs = {'speaker_id': speaker_id, 'style_wav': style_wav, 'x_vector': x_vector} + return cond_inputs + + def fit(self): + if self.restore_step != 0 or self.args.best_path: + logging.info(" > Restoring best loss from " + f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, + map_location="cpu")["model_loss"] + logging.info( + f" > Starting with loaded last best loss {self.best_loss}.") + + # define data loaders + self.train_loader = self.setup_train_dataloader( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_mapping=self.speaker_manager.speaker_ids) + self.eval_loader = self.setup_eval_dataloder( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_mapping=self.speaker_manager.speaker_ids + ) if self.config.run_eval else None + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage( + ) if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values + if self.config.run_eval else self.keep_avg_train.avg_values) + self.save_best_model() + self.on_epoch_end() + + def save_best_model(self): + self.best_loss = save_best_model( + self.keep_avg_eval['avg_loss'] + if self.keep_avg_eval else self.keep_avg_train['avg_loss'], + self.best_loss, + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + self.model_characters, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + scaler=self.scaler.state_dict() + if self.config.mixed_precision else None, + ) + + def on_epoch_start(self): + if hasattr(self.model, 'on_epoch_start'): + self.model.on_epoch_start(self) + + if hasattr(self.criterion, "on_epoch_start"): + self.criterion.on_epoch_start(self) + + if hasattr(self.optimizer, "on_epoch_start"): + self.optimizer.on_epoch_start(self) + + def on_epoch_end(self): + if hasattr(self.model, "on_epoch_start"): + self.model.on_epoch_end(self) + + if hasattr(self.criterion, "on_epoch_end"): + self.criterion.on_epoch_end(self) + + if hasattr(self.optimizer, "on_epoch_end"): + self.optimizer.on_epoch_end(self) + + def on_train_step_start(self): + if hasattr(self.model, "on_epoch_start"): + self.model.on_train_step_start(self) + + if hasattr(self.criterion, "on_train_step_start"): + self.criterion.on_train_step_start(self) + + if hasattr(self.optimizer, "on_train_step_start"): + self.optimizer.on_train_step_start(self) + + def on_train_step_end(self): + if hasattr(self.model, "on_train_step_end"): + self.model.on_train_step_end(self) + + if hasattr(self.criterion, "on_train_step_end"): + self.criterion.on_train_step_end(self) + + if hasattr(self.optimizer, "on_train_step_end"): + self.optimizer.on_train_step_end(self) From 891631ab47fa905dff5f86b5ca95c437eb0ae265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:33:58 +0200 Subject: [PATCH 130/258] typing annotation for the trainer --- TTS/trainer.py | 275 
+++++++++++++++++++++++++++---------------------- 1 file changed, 151 insertions(+), 124 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index cfb72191..3beb281f 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -65,12 +65,12 @@ class TrainerTTS: use_cuda, num_gpus = setup_torch_training_env(True, False) def __init__(self, - args, - config, - c_logger, - tb_logger, - model=None, - output_path=None): + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger, + tb_logger: TensorboardLogger, + model: nn.Module = None, + output_path: str = None) -> None: self.args = args self.config = config self.c_logger = c_logger @@ -88,43 +88,52 @@ class TrainerTTS: self.keep_avg_train = None self.keep_avg_eval = None + log_file = os.path.join(self.output_path, + f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + # model, audio processor, datasets, loss # init audio processor - self.ap = AudioProcessor(**config.audio.to_dict()) + self.ap = AudioProcessor(**self.config.audio.to_dict()) # init character processor - self.model_characters = self.init_character_processor() + self.model_characters = self.get_character_processor(self.config) # load dataset samples - self.data_train, self.data_eval = load_meta_data(config.datasets) + self.data_train, self.data_eval = load_meta_data(self.config.datasets) # default speaker manager - self.speaker_manager = self.init_speaker_manager() + self.speaker_manager = self.get_speaker_manager( + self.config, args.restore_path, self.config.output_path, self.data_train) # init TTS model if model is not None: self.model = model else: - self.model = self.init_model() + self.model = self.get_model( + len(self.model_characters), self.speaker_manager.num_speakers, + self.config, self.speaker_manager.x_vector_dim + if self.speaker_manager.x_vectors else None) # setup criterion - self.criterion = self.init_criterion() + self.criterion = self.get_criterion(self.config) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() # DISTRUBUTED if self.num_gpus > 1: init_distributed(args.rank, self.num_gpus, args.group_id, - config.distributed["backend"], - config.distributed["url"]) + self.config.distributed["backend"], + self.config.distributed["url"]) # scalers for mixed precision training self.scaler = torch.cuda.amp.GradScaler( - ) if config.mixed_precision else None + ) if self.config.mixed_precision and self.use_cuda else None # setup optimizer - self.optimizer = self.init_optimizer(self.model) - - # setup scheduler - self.scheduler = self.init_scheduler(self.config, self.optimizer) + self.optimizer = self.get_optimizer(self.model, self.config) if self.args.restore_path: self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( @@ -144,64 +153,66 @@ class TrainerTTS: logging.info("\n > Model has {} parameters".format(num_params), flush=True) - def init_model(self): - model = setup_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.x_vector_dim - if self.speaker_manager.x_vectors else None, - ) + @staticmethod + def get_model(num_chars: int, num_speakers: int, config: Coqpit, + x_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, x_vector_dim) return model - def init_optimizer(self, model): - optimizer_name = self.config.optimizer - optimizer_params = self.config.optimizer_params + @staticmethod + def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: + optimizer_name = config.optimizer 
+ optimizer_params = config.optimizer_params if optimizer_name.lower() == "radam": module = importlib.import_module("TTS.utils.radam") optimizer = getattr(module, "RAdam") else: optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), - lr=self.config.lr, - **optimizer_params) + return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - def init_character_processor(self): + @staticmethod + def get_character_processor(config: Coqpit) -> str: # setup custom characters if set in config file. # TODO: implement CharacterProcessor - if self.config.characters is not None: - symbols, phonemes = make_symbols( - **self.config.characters.to_dict()) + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters.to_dict()) else: - from TTS.tts.utils.text.symbols import symbols, phonemes - model_characters = phonemes if self.config.use_phonemes else symbols + from TTS.tts.utils.text.symbols import phonemes, symbols + model_characters = phonemes if config.use_phonemes else symbols return model_characters - def init_speaker_manager(self, restore_path: str = "", out_path: str = ""): + @staticmethod + def get_speaker_manager(config: Coqpit, + restore_path: str = "", + out_path: str = "", + data_train: List = []) -> SpeakerManager: speaker_manager = SpeakerManager() - if restore_path: - speakers_file = os.path.join(os.path.dirname(restore_path), - "speaker.json") - if not os.path.exists(speakers_file): - logging.info( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speakers_file = self.config.external_speaker_embedding_file + if config.use_speaker_embedding: + if restore_path: + speakers_file = os.path.join(os.path.dirname(restore_path), + "speaker.json") + if not os.path.exists(speakers_file): + print( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + speakers_file = config.external_speaker_embedding_file - if self.config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) + if config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + speaker_manager.load_ids_file(speakers_file) + elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file( + config.external_speaker_embedding_file) else: - self.speaker_manage.load_speaker_mapping(speakers_file) - elif self.config.use_external_speaker_embedding_file and self.config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file( - self.config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(self.data_train) - file_path = os.path.join(out_path, "speakers.json") - speaker_manager.save_ids_file(file_path) + speaker_manager.parse_speakers_from_items(data_train) + file_path = os.path.join(out_path, "speakers.json") + speaker_manager.save_ids_file(file_path) return speaker_manager - def init_scheduler(self, config, optimizer): + @staticmethod + def get_scheduler(config: Coqpit, + optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: @@ -213,17 +224,20 @@ class TrainerTTS: scheduler = getattr(torch.optim, lr_scheduler) return scheduler(optimizer, **lr_scheduler_params) - def init_criterion(self): - return setup_loss(self.config) + 
@staticmethod + def get_criterion(config: Coqpit) -> nn.Module: + return setup_loss(config) - def restore_model(self, - config, - restore_path, - model, - optimizer, - scaler=None): - logging.info(f" > Restoring from {os.path.basename(restore_path)}...") - checkpoint = torch.load(restore_path, map_location="cpu") + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) try: logging.info(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) @@ -242,20 +256,20 @@ class TrainerTTS: for group in optimizer.param_groups: group["lr"] = self.config.lr - logging.info(" > Model restored from step %d" % checkpoint["step"], - flush=True) + print(" > Model restored from step %d" % checkpoint["step"], ) restore_step = checkpoint["step"] return model, optimizer, scaler, restore_step - def _setup_loader(self, r, ap, is_eval, data_items, verbose, - speaker_mapping): + def _get_loader(self, r: int, ap: AudioProcessor, is_eval: bool, + data_items: List, verbose: bool, + speaker_mapping: Union[Dict, List]) -> DataLoader: if is_eval and not self.config.run_eval: loader = None else: dataset = TTSDataset( outputs_per_step=r, text_cleaner=self.config.text_cleaner, - compute_linear_spec= 'tacotron' == self.config.model.lower(), + compute_linear_spec=self.config.model.lower() == "tacotron", meta_data=data_items, ap=ap, tp=self.config.characters, @@ -296,17 +310,19 @@ class TrainerTTS: ) return loader - def setup_train_dataloader(self, r, ap, data_items, verbose, - speaker_mapping): - return self._setup_loader(r, ap, False, data_items, verbose, - speaker_mapping) + def get_train_dataloader(self, r: int, ap: AudioProcessor, + data_items: List, verbose: bool, + speaker_mapping: Union[List, Dict]) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, + speaker_mapping) - def setup_eval_dataloder(self, r, ap, data_items, verbose, - speaker_mapping): - return self._setup_loader(r, ap, True, data_items, verbose, - speaker_mapping) + def get_eval_dataloder(self, r: int, ap: AudioProcessor, data_items: List, + verbose: bool, + speaker_mapping: Union[List, Dict]) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, + speaker_mapping) - def format_batch(self, batch): + def format_batch(self, batch: List) -> Dict: # setup input batch text_input = batch[0] text_lengths = batch[1] @@ -401,7 +417,8 @@ class TrainerTTS: "item_idx": item_idx } - def train_step(self, batch, batch_n_steps, step, loader_start_time): + def train_step(self, batch: Dict, batch_n_steps: int, step: int, + loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -515,7 +532,7 @@ class TrainerTTS: self.on_train_step_end() return outputs, loss_dict - def train_epoch(self): + def train_epoch(self) -> None: self.model.train() epoch_start_time = time.time() if self.use_cuda: @@ -541,7 +558,7 @@ class TrainerTTS: self.tb_logger.tb_model_weights(self.model, self.total_steps_done) - def eval_step(self, batch, step): + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() @@ -572,17 +589,11 @@ class TrainerTTS: self.keep_avg_eval.avg_values) return outputs, loss_dict - def eval_epoch(self): + def 
eval_epoch(self) -> None: self.model.eval() - if self.use_cuda: - batch_num_steps = int( - len(self.train_loader.dataset) / - (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int( - len(self.train_loader.dataset) / self.config.batch_size) self.c_logger.print_eval_start() loader_start_time = time.time() + batch = None for cur_step, batch in enumerate(self.eval_loader): # format data batch = self.format_batch(batch) @@ -597,8 +608,8 @@ class TrainerTTS: {"EvalAudio": eval_audios}, self.ap.sample_rate) - def test_run(self, ): - logging.info(" | > Synthesizing test sentences.") + def test_run(self, ) -> None: + print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} test_sentences = self.config.test_sentences @@ -618,9 +629,11 @@ class TrainerTTS: do_trim_silence=False, ).values() - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + file_path = os.path.join(self.output_audio_path, + str(self.total_steps_done)) os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + file_path = os.path.join(file_path, + "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav test_figures["{}-prediction".format(idx)] = plot_spectrogram( @@ -629,16 +642,17 @@ class TrainerTTS: alignment, output_fig=False) self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, - self.config.audio["sample_rate"]) + self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - def _get_cond_inputs(self): + def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup x_vector - x_vector = self.speaker_manager.get_x_vectors_by_speaker( - self.speaker_manager.speaker_ids[0] - ) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None + x_vector = (self.speaker_manager.get_x_vectors_by_speaker( + self.speaker_manager.speaker_ids[0]) + if self.config.use_external_speaker_embedding_file + and self.config.use_speaker_embedding else None) # setup style_mel if self.config.has('gst_style_input'): style_wav = self.config.gst_style_input @@ -647,35 +661,40 @@ class TrainerTTS: if style_wav is None and 'use_gst' in self.config and self.config.use_gst: # inicialize GST with zero dict. style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + print( + "WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!" 
+ ) for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {'speaker_id': speaker_id, 'style_wav': style_wav, 'x_vector': x_vector} + cond_inputs = { + "speaker_id": speaker_id, + "style_wav": style_wav, + "x_vector": x_vector + } return cond_inputs - def fit(self): + def fit(self) -> None: if self.restore_step != 0 or self.args.best_path: - logging.info(" > Restoring best loss from " - f"{os.path.basename(self.args.best_path)} ...") + print(" > Restoring best loss from " + f"{os.path.basename(self.args.best_path)} ...") self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - logging.info( - f" > Starting with loaded last best loss {self.best_loss}.") + print(f" > Starting with loaded last best loss {self.best_loss}.") # define data loaders - self.train_loader = self.setup_train_dataloader( + self.train_loader = self.get_train_dataloader( self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids) - self.eval_loader = self.setup_eval_dataloder( + self.eval_loader = (self.get_eval_dataloder( self.config.r, self.ap, self.data_train, verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids - ) if self.config.run_eval else None + speaker_mapping=self.speaker_manager.speaker_ids) + if self.config.run_eval else None) self.total_steps_done = self.restore_step @@ -697,10 +716,10 @@ class TrainerTTS: self.save_best_model() self.on_epoch_end() - def save_best_model(self): + def save_best_model(self) -> None: self.best_loss = save_best_model( - self.keep_avg_eval['avg_loss'] - if self.keep_avg_eval else self.keep_avg_train['avg_loss'], + self.keep_avg_eval["avg_loss"] + if self.keep_avg_eval else self.keep_avg_train["avg_loss"], self.best_loss, self.model, self.optimizer, @@ -715,8 +734,16 @@ class TrainerTTS: if self.config.mixed_precision else None, ) - def on_epoch_start(self): - if hasattr(self.model, 'on_epoch_start'): + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, + format="", + handlers=[logging.FileHandler(log_file), + logging.StreamHandler()]) + + def on_epoch_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_start"): self.model.on_epoch_start(self) if hasattr(self.criterion, "on_epoch_start"): @@ -725,8 +752,8 @@ class TrainerTTS: if hasattr(self.optimizer, "on_epoch_start"): self.optimizer.on_epoch_start(self) - def on_epoch_end(self): - if hasattr(self.model, "on_epoch_start"): + def on_epoch_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_end"): self.model.on_epoch_end(self) if hasattr(self.criterion, "on_epoch_end"): @@ -735,8 +762,8 @@ class TrainerTTS: if hasattr(self.optimizer, "on_epoch_end"): self.optimizer.on_epoch_end(self) - def on_train_step_start(self): - if hasattr(self.model, "on_epoch_start"): + def on_train_step_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_start"): self.model.on_train_step_start(self) if hasattr(self.criterion, "on_train_step_start"): @@ -745,7 +772,7 @@ class TrainerTTS: if hasattr(self.optimizer, "on_train_step_start"): self.optimizer.on_train_step_start(self) - def on_train_step_end(self): + def on_train_step_end(self) -> None: # pylint: disable=no-self-use if hasattr(self.model, "on_train_step_end"): self.model.on_train_step_end(self) From b2218e882a9386e36e2caac84d1bb628e965cb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 
May 2021 09:52:29 +0200 Subject: [PATCH 131/258] update `glow_tts_config.py` for setting the optimizer and the scheduler --- TTS/tts/configs/glow_tts_config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 36ccb612..214b2377 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -89,12 +89,13 @@ class GlowTTSConfig(BaseTTSConfig): use_external_speaker_embedding_file: bool = False external_speaker_embedding_file: str = False - # optimizer params - noam_schedule: bool = True - warmup_steps: int = 4000 + # optimizer parameters + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) grad_clip: float = 5.0 lr: float = 1e-3 - wd: float = 0.000001 # overrides min_seq_len: int = 3 From 9134c7dfb69b7e48eaa60e98ee7e1a1963237fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:53:27 +0200 Subject: [PATCH 132/258] update `sequence_mask` import globally --- TTS/tts/layers/glow_tts/encoder.py | 2 +- TTS/tts/layers/glow_tts/monotonic_align/__init__.py | 2 +- TTS/tts/models/align_tts.py | 2 +- TTS/tts/models/glow_tts.py | 5 ++++- TTS/tts/models/speedy_speech.py | 2 +- tests/tts_tests/test_feed_forward_layers.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 2 +- tests/tts_tests/test_tacotron_layers.py | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index 48bb3008..71aee94f 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -9,7 +9,7 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class Encoder(nn.Module): diff --git a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py index 7be124f4..5cbfd8fc 100644 --- a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py +++ b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch.nn import functional as F -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask try: # TODO: fix pypi cython installation problem. 
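For context on the import move above: `sequence_mask` now lives in `TTS.tts.utils.data` and only the import path changes; call sites keep using it as before. A minimal sketch of the conventional implementation and its output, assuming the function keeps its usual signature (a batch of lengths in, a boolean padding mask out) — this sketch is illustrative and not part of the patch:

    import torch

    def sequence_mask(sequence_length, max_len=None):
        # mask[i, t] is True for valid positions t < sequence_length[i] (assumed behaviour)
        if max_len is None:
            max_len = int(sequence_length.max())
        seq_range = torch.arange(max_len, device=sequence_length.device)
        return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

    lengths = torch.tensor([3, 5])
    print(sequence_mask(lengths))
    # tensor([[ True,  True,  True, False, False],
    #         [ True,  True,  True,  True,  True]])
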
diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index e097ac50..db04b72c 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -7,7 +7,7 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class AlignTTS(nn.Module): diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 19eb594a..ca059ab9 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -6,8 +6,11 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class GlowTTS(nn.Module): diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 9880b82b..bc6e912c 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -6,7 +6,7 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class SpeedySpeech(nn.Module): diff --git a/tests/tts_tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py index 1db980a3..1c2d3803 100644 --- a/tests/tts_tests/test_feed_forward_layers.py +++ b/tests/tts_tests/test_feed_forward_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.encoder import Encoder -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 3473769b..21a73812 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.models.speedy_speech import SpeedySpeech -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 9b89e645..6c4b76b5 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -4,7 +4,7 @@ import torch as T from TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask # pylint: 
disable=unused-variable From bb4deee64cbcdfad8962d1539a8d8b9f2cc11ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:54:04 +0200 Subject: [PATCH 133/258] update glow-tts for the trainer --- TTS/tts/models/glow_tts.py | 194 +++++++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 41 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index ca059ab9..09e58ce7 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -38,7 +38,6 @@ class GlowTTS(nn.Module): encoder_params (dict): encoder module parameters. speaker_embedding_dim (int): channels of external speaker embedding vectors. """ - def __init__( self, num_chars, @@ -133,27 +132,29 @@ class GlowTTS(nn.Module): @staticmethod def compute_outputs(attn, o_mean, o_log_scale, x_mask): # compute final values with the computed alignment - y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( - 1, 2 - ) # [b, t', t], [b, t, d] -> [b, d, t'] - y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose( - 1, 2 - ) # [b, t', t], [b, t, d] -> [b, d, t'] + y_mean = torch.matmul( + attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( + 1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + y_log_scale = torch.matmul( + attn.squeeze(1).transpose(1, 2), o_log_scale.transpose( + 1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] # compute total duration with adjustment o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def forward(self, x, x_lengths, y, y_lengths=None, cond_input={'x_vectors':None}): """ Shapes: x: [B, T] x_lenghts: B - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ y_max_length = y.size(2) + y = y.transpose(1, 2) # norm speaker embeddings + g = cond_input['x_vectors'] if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -161,29 +162,54 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess( + y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path with torch.no_grad(): o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, + [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * + (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), + z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, + [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] - attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + attn = maximum_path(logp, + attn_mask.squeeze(1)).unsqueeze(1).detach() + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) - return z, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': z, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs @torch.no_grad() - def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def inference_with_MAS(self, + x, + x_lengths, + y=None, + y_lengths=None, + attn=None, + g=None): """ It's similar to the teacher forcing in Tacotron. It was proposed in: https://arxiv.org/abs/2104.05557 @@ -203,24 +229,33 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
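
For orientation, the refactored GlowTTS interface above passes conditioning through a `cond_input` dictionary and returns a dictionary of named tensors instead of a tuple. The sketch below shows the new calling convention only; it assumes `model` is an already-constructed GlowTTS instance (constructor arguments omitted) and the batch shapes are made up for illustration — the dictionary keys are the ones introduced in this hunk.

import torch

# assumed: `model` is a GlowTTS instance built elsewhere with a matching num_chars/out_channels
text = torch.randint(0, 100, (2, 50))   # [B, T_text] token ids, illustrative values
text_lengths = torch.tensor([50, 42])
mels = torch.randn(2, 120, 80)          # [B, T_mel, C], the layout documented in the new forward()
mel_lengths = torch.tensor([120, 95])

outputs = model.forward(text, text_lengths, mels, mel_lengths,
                        cond_input={"x_vectors": None})
z = outputs["model_outputs"]         # flow-decoder latents
attn = outputs["alignments"]         # monotonic alignment found by maximum_path
dur_log = outputs["durations_log"]   # log-durations predicted from the encoder
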
- y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess( + y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path between z and encoder output o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, + [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * + (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), + z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, + [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) # get predited aligned distribution @@ -228,8 +263,16 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': y, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs @torch.no_grad() def decoder_inference(self, y, y_lengths=None, g=None): @@ -247,7 +290,8 @@ class GlowTTS(nn.Module): else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(y.dtype) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) @@ -266,28 +310,98 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, + x_lengths, + g=g) # compute output durations w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() y_max_length = None # compute masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), + 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # compute attention mask - attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) - y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) + attn = 
generate_path(w_ceil.squeeze(1), + attn_mask.squeeze(1)).unsqueeze(1) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs( + attn, o_mean, o_log_scale, x_mask) - z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask + z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * + self.inference_noise_scale) * y_mask # decoder pass y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + 'model_outputs': y, + 'logdet': logdet, + 'y_mean': y_mean, + 'y_log_scale': y_log_scale, + 'alignments': attn, + 'durations_log': o_dur_log, + 'total_durations_log': o_attn_dur + } + return outputs + + def train_step(self, batch: dict, criterion: nn.Module): + """Perform a single training step by fetching the right set if samples from the batch. + + Args: + batch (dict): [description] + criterion (nn.Module): [description] + """ + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + + outputs = self.forward(text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"x_vectors": x_vectors}) + + loss_dict = criterion(outputs['model_outputs'], outputs['y_mean'], + outputs['y_log_scale'], outputs['logdet'], + mel_lengths, outputs['durations_log'], + outputs['total_durations_log'], text_lengths) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) def preprocess(self, y, y_lengths, y_max_length, attn=None): if y_max_length is not None: - y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze + y_max_length = (y_max_length // + self.num_squeeze) * self.num_squeeze y = y[:, :, :y_max_length] if attn is not None: attn = attn[:, :, :, :y_max_length] @@ -297,9 +411,7 @@ class GlowTTS(nn.Module): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: From 4e910993f1177fedd63fef6b7fd8cfbe80f6f2b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 09:54:48 +0200 Subject: [PATCH 134/258] update tacotron model to 
return `model_outputs` --- TTS/tts/models/tacotron.py | 8 ++++---- TTS/tts/models/tacotron2.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 23bd839f..34f04159 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -255,7 +255,7 @@ class Tacotron(TacotronAbstract): outputs['alignments_backward'] = alignments_backward outputs['decoder_outputs_backward'] = decoder_outputs_backward outputs.update({ - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -287,7 +287,7 @@ class Tacotron(TacotronAbstract): postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) outputs = { - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -335,7 +335,7 @@ class Tacotron(TacotronAbstract): # compute loss loss_dict = criterion( - outputs['postnet_outputs'], + outputs['model_outputs'], outputs['decoder_outputs'], mel_input, linear_input, @@ -355,7 +355,7 @@ class Tacotron(TacotronAbstract): return outputs, loss_dict def train_log(self, ap, batch, outputs): - postnet_outputs = outputs['postnet_outputs'] + postnet_outputs = outputs['model_outputs'] alignments = outputs['alignments'] alignments_backward = outputs['alignments_backward'] mel_input = batch['mel_input'] diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 51b181e4..04b97606 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -233,7 +233,7 @@ class Tacotron2(TacotronAbstract): outputs['alignments_backward'] = alignments_backward outputs['decoder_outputs_backward'] = decoder_outputs_backward outputs.update({ - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens @@ -254,7 +254,7 @@ class Tacotron2(TacotronAbstract): x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input + x_vector = cond_input['x_vectors'] encoder_outputs = self._concat_speaker_embedding( encoder_outputs, x_vector) @@ -266,7 +266,7 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments = self.shape_outputs( decoder_outputs, postnet_outputs, alignments) outputs = { - 'postnet_outputs': postnet_outputs, + 'model_outputs': postnet_outputs, 'decoder_outputs': decoder_outputs, 'alignments': alignments, 'stop_tokens': stop_tokens From 06ee57d816b34e9d66e7bda8f4771a012ae4373c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:02:01 +0200 Subject: [PATCH 135/258] update `speedy_speecy_config.py` for the trainer --- TTS/tts/configs/speedy_speech_config.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 1b8f0c82..42258398 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,4 +1,5 @@ from dataclasses import dataclass, field +from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -99,10 +100,11 @@ class SpeedySpeechConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - 
noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # loss params @@ -114,3 +116,12 @@ class SpeedySpeechConfig(BaseTTSConfig): min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 # DO NOT CHANGE + + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." + ]) From c70d0c9dae1129b49f8cef3043a22bb2c2612075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:03:24 +0200 Subject: [PATCH 136/258] update `speedy_speech.py` model for trainer --- TTS/tts/models/speedy_speech.py | 139 +++++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 18 deletions(-) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index bc6e912c..daf67b6c 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -3,6 +3,9 @@ from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path @@ -46,7 +49,12 @@ class SpeedySpeech(nn.Module): positional_encoding=True, length_scale=1, encoder_type="residual_conv_bn", - encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, + encoder_params={ + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13 + }, decoder_type="residual_conv_bn", decoder_params={ "kernel_size": 4, @@ -60,13 +68,17 @@ class SpeedySpeech(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance( + length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, + encoder_params, c_in_channels) if positional_encoding: self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, + decoder_params) + self.duration_predictor = DurationPredictor(hidden_channels + + c_in_channels) if num_speakers > 1 and not external_c: # speaker embedding layer @@ -93,7 +105,9 @@ class SpeedySpeech(nn.Module): """ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), 
en.transpose(1, 2)).transpose(1, 2) + o_en_ex = torch.matmul( + attn.squeeze(1).transpose(1, 2), en.transpose(1, + 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -127,7 +141,8 @@ class SpeedySpeech(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), + 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -140,7 +155,8 @@ class SpeedySpeech(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -153,8 +169,17 @@ class SpeedySpeech(nn.Module): o_de = self.decoder(o_en_ex, y_mask, g=g) return o_de, attn.transpose(1, 2) - def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument + def forward(self, + x, + x_lengths, + y_lengths, + dr, + cond_input={ + 'x_vectors': None, + 'speaker_ids': None + }): # pylint: disable=unused-argument """ + TODO: speaker embedding for speaker_ids Shapes: x: [B, T_max] x_lengths: [B] @@ -162,35 +187,113 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) - return o_de, o_dr_log.squeeze(1), attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr, + x_mask, + y_lengths, + g=g) + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'durations_log': o_dr_log.squeeze(1), + 'alignments': attn + } + return outputs - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, + x, + cond_input={ + 'x_vectors': None, + 'speaker_ids': None + }): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: inference_padding += 13 - x.shape[1] # pad input to prevent dropping the last word - x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0) + x = torch.nn.functional.pad(x, + pad=(0, inference_padding), + mode="constant", + value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) # duration predictor pass o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + o_dr, + x_mask, + y_lengths, + g=g) + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'alignments': attn, + 'durations_log': None + } + return outputs - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def train_step(self, batch: dict, criterion: nn.Module): + text_input = 
batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + speaker_ids = batch['speaker_ids'] + durations = batch['durations'] + + cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, + durations, cond_input) + + # compute loss + loss_dict = criterion(outputs['model_outputs'], mel_input, + mel_lengths, outputs['durations_log'], + torch.log(1 + durations), text_lengths) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], + binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) + + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: From b8a4af4010260517fa40ce04980c1e46993f20d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 26 May 2021 16:03:56 +0200 Subject: [PATCH 137/258] update `synthesis.py` for being more generic --- TTS/tts/utils/synthesis.py | 58 ++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 90017bb1..d42d82f8 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -153,16 +153,6 @@ def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): return decoder_output, postnet_output, None, None -def parse_outputs_torch(postnet_output, decoder_output, alignments, - stop_tokens): - postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = None if decoder_output is None else decoder_output[ - 0].data.cpu().numpy() - alignment = alignments[0].cpu().data.numpy() - stop_tokens = None if stop_tokens is None else stop_tokens[0].cpu().numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): postnet_output = postnet_output[0].numpy() decoder_output = decoder_output[0].numpy() @@ -201,8 +191,8 @@ def speaker_id_to_torch(speaker_id, cuda=False): def embedding_to_torch(x_vector, cuda=False): if x_vector is not None: x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze( - 0).type(torch.FloatTensor) + x_vector = torch.from_numpy(x_vector).unsqueeze(0).type( + torch.FloatTensor) if cuda: return x_vector.cuda() return x_vector @@ -264,57 +254,59 @@ def synthesis( else: style_mel = 
compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text - inputs = text_to_seq(text, CONFIG) + text_inputs = text_to_seq(text, CONFIG) # pass tensors to backend if backend == "torch": if speaker_id is not None: speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) if x_vector is not None: - x_vector = embedding_to_torch(x_vector, - cuda=use_cuda) + x_vector = embedding_to_torch(x_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) - inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) - inputs = inputs.unsqueeze(0) + text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) + text_inputs = text_inputs.unsqueeze(0) elif backend == "tf": # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) + text_inputs = numpy_to_tf(text_inputs, tf.int32) + text_inputs = tf.expand_dims(text_inputs, 0) elif backend == "tflite": style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) + text_inputs = numpy_to_tf(text_inputs, tf.int32) + text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": outputs = run_model_torch(model, - inputs, + text_inputs, speaker_id, style_mel, x_vector=x_vector) - postnet_output, decoder_output, alignments, stop_tokens = \ - outputs['postnet_outputs'], outputs['decoder_outputs'],\ - outputs['alignments'], outputs['stop_tokens'] - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens) + model_outputs = outputs['model_outputs'] + model_outputs = model_outputs[0].data.cpu().numpy() elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, speaker_id, style_mel) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( + model, text_inputs, CONFIG, speaker_id, style_mel) + model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( postnet_output, decoder_output, alignments, stop_tokens) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, inputs, CONFIG, speaker_id, style_mel) - postnet_output, decoder_output = parse_outputs_tflite( + model, text_inputs, CONFIG, speaker_id, style_mel) + model_outputs, decoder_output = parse_outputs_tflite( postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None if use_griffin_lim: - wav = inv_spectrogram(postnet_output, ap, CONFIG) + wav = inv_spectrogram(model_outputs, ap, CONFIG) # trim silence if do_trim_silence: wav = trim_silence(wav, ap) - return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs + return_dict = { + 'wav': wav, + 'alignments': outputs['alignments'], + 'model_outputs': model_outputs, + 'text_inputs': text_inputs + } + return return_dict From e298b8e364bc6022915113241005eeda5923c818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:24:26 +0200 Subject: [PATCH 138/258] update trainer.py for better logging handling, restoring models and rename init_ functions with get_ --- TTS/bin/train_tts.py | 6 +++++- TTS/trainer.py | 22 ++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 5058d341..7cc8a25f 100644 --- 
a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -10,7 +10,11 @@ def main(): # try: args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training( sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) + trainer = TrainerTTS(args, + config, + c_logger, + tb_logger, + output_path=OUT_PATH) trainer.fit() # except KeyboardInterrupt: # remove_experiment_folder(OUT_PATH) diff --git a/TTS/trainer.py b/TTS/trainer.py index 3beb281f..6087f1bc 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,12 +1,13 @@ # -*- coding: utf-8 -*- +import importlib +import logging import os import sys import time import traceback +from logging import StreamHandler from random import randrange -import logging -import importlib import numpy as np import torch @@ -16,19 +17,19 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets import load_meta_data, TTSDataset +from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.arguments import init_training -from TTS.tts.utils.visual import plot_spectrogram, plot_alignment from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict, find_module -from TTS.utils.training import setup_torch_training_env, check_update +from TTS.utils.generic_utils import KeepAverage, count_parameters, find_module, remove_experiment_folder, set_init_dict +from TTS.utils.training import check_update, setup_torch_training_env @dataclass @@ -140,9 +141,8 @@ class TrainerTTS: self.config, args.restore_path, self.model, self.optimizer, self.scaler) - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() + # setup scheduler + self.scheduler = self.get_scheduler(self.config, self.optimizer) # DISTRUBUTED if self.num_gpus > 1: @@ -150,8 +150,7 @@ class TrainerTTS: # count model size num_params = count_parameters(self.model) - logging.info("\n > Model has {} parameters".format(num_params), - flush=True) + logging.info("\n > Model has {} parameters".format(num_params)) @staticmethod def get_model(num_chars: int, num_speakers: int, config: Coqpit, @@ -241,7 +240,6 @@ class TrainerTTS: try: logging.info(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) - # optimizer restore logging.info(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint["optimizer"]) if "scaler" in checkpoint and config.mixed_precision: From fc9a0fb8ced419a23cc346c28162c2b60bc37deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:25:25 +0200 Subject: [PATCH 139/258] update aling_tts_config for the trainer --- TTS/tts/configs/align_tts_config.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 2956d935..115e969c 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -92,13 +92,24 @@ class 
AlignTTSConfig(BaseTTSConfig): external_speaker_embedding_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "Adam" + optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # overrides min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 + + # testing + test_sentences: List[str] = field(default_factory=lambda:[ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963." + ]) + From 9203b863d92616a619e9f4593a500be6a8855a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:25:40 +0200 Subject: [PATCH 140/258] update align_tts_loss for trainer --- TTS/tts/layers/losses.py | 42 +++++----------------------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 27c6e9e5..517eb533 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -462,13 +462,12 @@ class MDNLoss(nn.Module): class AlignTTSLoss(nn.Module): """Modified AlignTTS Loss. - Computes following losses + Computes - L1 and SSIM losses from output spectrograms. - Huber loss for duration predictor. - MDNLoss for Mixture of Density Network. - All the losses are aggregated by a weighted sum with the loss alphas. - Alphas can be scheduled based on number of steps. + All loss values are aggregated by a weighted sum of the alpha values. Args: c (dict): TTS model configuration. @@ -487,9 +486,9 @@ class AlignTTSLoss(nn.Module): self.mdn_alpha = c.mdn_alpha def forward( - self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, step, phase + self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, phase ): - ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) + # ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0 if phase == 0: mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens) @@ -507,36 +506,5 @@ class AlignTTSLoss(nn.Module): spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens) ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens) dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens) - loss = spec_loss_alpha * spec_loss + ssim_alpha * ssim_loss + dur_loss_alpha * dur_loss + mdn_alpha * mdn_loss + loss = self.spec_loss_alpha * spec_loss + self.ssim_alpha * ssim_loss + self.dur_loss_alpha * dur_loss + self.mdn_alpha * mdn_loss return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} - - @staticmethod - def _set_alpha(step, alpha_settings): - """Set the loss alpha wrt number of steps. - Return the corresponding value if no schedule is set. - - Example: - Setting a alpha schedule. - if ```alpha_settings``` is ```[[0, 1], [10000, 0.1]]``` then ```return_alpha == 1``` until 10k steps, then set to 0.1. - if ```alpha_settings``` is a constant value then ```return_alpha``` is set to that constant. 
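
For reference, the step-based weighting dropped in this hunk can be paraphrased as the small standalone sketch below; the function name and example values are illustrative only, while the logic mirrors the `_set_alpha` staticmethod removed here.

def scheduled_alpha(step, alpha_settings):
    """Paraphrase of the removed helper: entries [[start_step, value], ...] switch the
    weight once training passes each listed step; a plain number is returned unchanged."""
    return_alpha = None
    if isinstance(alpha_settings, list):
        for start_step, alpha in alpha_settings:
            if start_step < step:
                return_alpha = alpha
    elif isinstance(alpha_settings, (float, int)):
        return_alpha = alpha_settings
    return return_alpha

print(scheduled_alpha(5000, [[0, 1.0], [10000, 0.1]]))   # 1.0 before 10k steps
print(scheduled_alpha(20000, [[0, 1.0], [10000, 0.1]]))  # 0.1 afterwards

After this patch the alphas are used as constants, so the aggregate loss is a fixed weighted sum and the criterion no longer needs the global step.
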
- - Args: - step (int): number of training steps. - alpha_settings (int or list): constant alpha value or a list defining the schedule as explained above. - """ - return_alpha = None - if isinstance(alpha_settings, list): - for key, alpha in alpha_settings: - if key < step: - return_alpha = alpha - elif isinstance(alpha_settings, (float, int)): - return_alpha = alpha_settings - return return_alpha - - def set_alphas(self, step): - """Set the alpha values for all the loss functions""" - ssim_alpha = self._set_alpha(step, self.ssim_alpha) - dur_loss_alpha = self._set_alpha(step, self.dur_loss_alpha) - spec_loss_alpha = self._set_alpha(step, self.spec_loss_alpha) - mdn_alpha = self._set_alpha(step, self.mdn_alpha) - return ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha From bb355b7441445726104eca69a2f28e075dd09632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:26:09 +0200 Subject: [PATCH 141/258] update align_tts.py model for the trainer --- TTS/tts/models/align_tts.py | 198 ++++++++++++++++++++++++++++++------ 1 file changed, 168 insertions(+), 30 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index db04b72c..6d61eae2 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -4,6 +4,9 @@ import torch.nn as nn from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path @@ -69,9 +72,19 @@ class AlignTTS(nn.Module): hidden_channels=256, hidden_channels_dp=256, encoder_type="fftransformer", - encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, + encoder_params={ + "hidden_channels_ffn": 1024, + "num_heads": 2, + "num_layers": 6, + "dropout_p": 0.1 + }, decoder_type="fftransformer", - decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, + decoder_params={ + "hidden_channels_ffn": 1024, + "num_heads": 2, + "num_layers": 6, + "dropout_p": 0.1 + }, length_scale=1, num_speakers=0, external_c=False, @@ -79,11 +92,15 @@ class AlignTTS(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale + self.phase = -1 + self.length_scale = float(length_scale) if isinstance( + length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, + encoder_params, c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, + decoder_params) self.duration_predictor = DurationPredictor(hidden_channels_dp) self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) @@ -104,9 +121,9 @@ class AlignTTS(nn.Module): mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] log_sigma = log_sigma.transpose(1, 
2).unsqueeze(2) # [B, T2, 1, D] expanded_y, expanded_mu = torch.broadcast_tensors(y, mu) - exponential = -0.5 * torch.mean( - torch._C._nn.mse_loss(expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), dim=-1 - ) # B, L, T + exponential = -0.5 * torch.mean(torch._C._nn.mse_loss( + expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), + dim=-1) # B, L, T logp = exponential - 0.5 * log_sigma.mean(dim=-1) return logp @@ -140,7 +157,9 @@ class AlignTTS(nn.Module): [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) + o_en_ex = torch.matmul( + attn.squeeze(1).transpose(1, 2), en.transpose(1, + 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -174,7 +193,8 @@ class AlignTTS(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), + 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -187,7 +207,8 @@ class AlignTTS(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -203,11 +224,13 @@ class AlignTTS(nn.Module): def _forward_mdn(self, o_en, y, y_lengths, x_mask): # MAS potentials and alignment mu, log_sigma = self.mdn_block(o_en) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) - dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en.dtype) + dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, + y_mask) return dr_mas, mu, log_sigma, logp - def forward(self, x, x_lengths, y, y_lengths, phase=None, g=None): # pylint: disable=unused-argument + def forward(self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] @@ -216,47 +239,85 @@ class AlignTTS(nn.Module): dr: [B, T_max] g: [B, C] """ + y = y.transpose(1, 2) + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), + 1).to(o_en_dp.dtype) attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en.detach(), o_en_dp.detach(), dr_mas.detach(), x_mask, y_lengths, g=g) + o_de, attn = self._forward_decoder(o_en.detach(), + o_en_dp.detach(), + dr_mas.detach(), + x_mask, + y_lengths, + g=g) elif phase == 2: # train the whole except duration predictor o_en, 
o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) elif phase == 3: # train duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(x, x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) o_dr_log = o_dr_log.squeeze(1) else: o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn( + o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + dr_mas, + x_mask, + y_lengths, + g=g) o_dr_log = o_dr_log.squeeze(1) dr_mas_log = torch.log(dr_mas + 1).squeeze(1) - return o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp + outputs = { + 'model_outputs': o_de.transpose(1, 2), + 'alignments': attn, + 'durations_log': o_dr_log, + 'durations_mas_log': dr_mas_log, + 'mu': mu, + 'log_sigma': log_sigma, + 'logp': logp + } + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, x, cond_input={'x_vectors': None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ + g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) @@ -265,14 +326,91 @@ class AlignTTS(nn.Module): # duration predictor pass o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + o_de, attn = self._forward_decoder(o_en, + o_en_dp, + o_dr, + x_mask, + y_lengths, + g=g) + outputs = {'model_outputs': o_de.transpose(1, 2), 'alignments': attn} + return outputs - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin + def train_step(self, batch: dict, criterion: nn.Module): + text_input = batch['text_input'] + text_lengths = batch['text_lengths'] + mel_input = batch['mel_input'] + mel_lengths = batch['mel_lengths'] + x_vectors = batch['x_vectors'] + speaker_ids = batch['speaker_ids'] + + cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) + loss_dict = criterion( + outputs['logp'], + outputs['model_outputs'], + mel_input, + mel_lengths, + outputs['durations_log'], + outputs['durations_mas_log'], + text_lengths, + phase=self.phase, + ) + + # compute alignment error (the lower the better 
) + align_error = 1 - alignment_diagonal_score(outputs['alignments'], + binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + model_outputs = outputs['model_outputs'] + alignments = outputs['alignments'] + mel_input = batch['mel_input'] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, train_audio + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) + + def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: self.eval() assert not self.training + + @staticmethod + def _set_phase(config, global_step): + """Decide AlignTTS training phase""" + if isinstance(config.phase_start_steps, list): + vals = [i < global_step for i in config.phase_start_steps] + if not True in vals: + phase = 0 + else: + phase = ( + len(config.phase_start_steps) + - [i < global_step for i in config.phase_start_steps][::-1].index(True) + - 1 + ) + else: + phase = None + return phase + + def on_epoch_start(self, trainer): + """Set AlignTTS training phase on epoch start.""" + self.phase = self._set_phase(trainer.config, trainer.total_steps_done) From 7dff6be87191cd9cbdfa100a151bfd8af588a3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:26:31 +0200 Subject: [PATCH 142/258] update tts training tests to use the trainer --- tests/tts_tests/test_align_tts_train.py | 5 +++-- tests/tts_tests/test_glow_tts_train.py | 7 ++++--- tests/tts_tests/test_speedy_speech_train.py | 7 ++++--- tests/tts_tests/test_tacotron2_train.py | 5 +++-- tests/tts_tests/test_tacotron_train.py | 5 +++-- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 848f46c1..4bf3802f 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -30,12 +30,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs -1" ) run_cli(command_train) @@ -44,7 +45,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python 
TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index e44f6365..0ae25701 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -31,13 +31,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -46,7 +47,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9dcf0ad8..c8716fb0 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -30,13 +30,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -45,7 +46,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index dbec309b..aef507a5 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -31,12 +31,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " 
f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -45,7 +46,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 34ee6e06..771ad93c 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -30,12 +30,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -44,7 +45,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) From d25f017b421b2f965c6e61c8c3023e387d6575a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 10:47:24 +0200 Subject: [PATCH 143/258] update `setup_model.py` imports --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/convert_tacotron2_torch_to_tf.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/utils/synthesizer.py | 2 +- tests/inference_tests/test_synthesizer.py | 2 +- tests/test_extract_tts_spectrograms.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index e14ff433..3cbf40ba 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import load_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index d523d01e..e7f991be 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -11,7 +11,7 @@ import torch from TTS.tts.tf.models.tacotron2 import Tacotron2 from 
TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.generic_utils import save_checkpoint -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.utils.io import load_config diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index da6de9c0..970600aa 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -12,7 +12,7 @@ from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.speakers import parse_speakers from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index d231484a..b94d7a65 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -16,7 +16,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9a455a1b..9df185ee 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -17,7 +17,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 742a27d8..57ff4272 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -18,7 +18,7 @@ from torch.utils.data.distributed import DistributedSampler from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index b5e38b80..f833ffc6 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -14,7 +14,7 @@ from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from 
TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 5962950f..8af95a12 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -6,7 +6,7 @@ import pysbd import torch from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index a1cd4de5..b0fa22d3 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -3,7 +3,7 @@ import unittest from tests import get_tests_output_path from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.io import save_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.synthesizer import Synthesizer diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index 38cee473..ddc7e4da 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -5,7 +5,7 @@ import torch from tests import get_tests_input_path, get_tests_output_path, run_cli from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols torch.manual_seed(1) From 73bf9673ed575f571d0374bc43f7a2ed102dce49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:38:46 +0200 Subject: [PATCH 144/258] revert logging.info to print statements for trainer --- TTS/trainer.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 6087f1bc..63b9cd42 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -150,7 +150,7 @@ class TrainerTTS: # count model size num_params = count_parameters(self.model) - logging.info("\n > Model has {} parameters".format(num_params)) + print("\n > Model has {} parameters".format(num_params)) @staticmethod def get_model(num_chars: int, num_speakers: int, config: Coqpit, @@ -186,7 +186,6 @@ class TrainerTTS: out_path: str = "", data_train: List = []) -> SpeakerManager: speaker_manager = SpeakerManager() - if config.use_speaker_embedding: if restore_path: speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") @@ -196,16 +195,6 @@ class TrainerTTS: ) speakers_file = config.external_speaker_embedding_file - if config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) - else: - speaker_manager.load_ids_file(speakers_file) - elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file( - config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(data_train) - file_path = os.path.join(out_path, "speakers.json") speaker_manager.save_ids_file(file_path) return speaker_manager @@ -238,15 +227,15 @@ class TrainerTTS: print(" > Restoring from %s ..." 
% os.path.basename(restore_path)) checkpoint = torch.load(restore_path) try: - logging.info(" > Restoring Model...") + print(" > Restoring Model...") model.load_state_dict(checkpoint["model"]) - logging.info(" > Restoring Optimizer...") + print(" > Restoring Optimizer...") optimizer.load_state_dict(checkpoint["optimizer"]) if "scaler" in checkpoint and config.mixed_precision: - logging.info(" > Restoring AMP Scaler...") + print(" > Restoring AMP Scaler...") scaler.load_state_dict(checkpoint["scaler"]) except (KeyError, RuntimeError): - logging.info(" > Partial model initialization...") + print(" > Partial model initialization...") model_dict = model.state_dict() model_dict = set_init_dict(model_dict, checkpoint["model"], config) model.load_state_dict(model_dict) From c680a07a2055086f5be436df295dc5cad5613f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 11:39:34 +0200 Subject: [PATCH 145/258] fix `Synthesized` for the new `synthesis()` --- TTS/tts/utils/synthesis.py | 36 ------------------------------------ TTS/utils/synthesizer.py | 6 ++++-- 2 files changed, 4 insertions(+), 38 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index d42d82f8..d58886e9 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -78,42 +78,6 @@ def run_model_torch(model, 'x_vector': x_vector, 'style_mel': style_mel }) - # elif "glow" in CONFIG.model.lower(): - # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - # if hasattr(model, "module"): - # # distributed model - # postnet_output, _, _, _, alignments, _, _ = model.module.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # else: - # postnet_output, _, _, _, alignments, _, _ = model.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # postnet_output = postnet_output.permute(0, 2, 1) - # # these only belong to tacotron models. - # decoder_output = None - # stop_tokens = None - # elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: - # inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - # if hasattr(model, "module"): - # # distributed model - # postnet_output, alignments = model.module.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # else: - # postnet_output, alignments = model.inference( - # inputs, - # inputs_lengths, - # g=speaker_id if speaker_id is not None else speaker_embeddings) - # postnet_output = postnet_output.permute(0, 2, 1) - # # these only belong to tacotron models. - # decoder_output = None - # stop_tokens = None - # else: - # raise ValueError("[!] 
Unknown model name.") return outputs diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8af95a12..a8332eb8 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -222,7 +222,7 @@ class Synthesizer(object): for sen in sens: # synthesize voice - waveform, _, _, mel_postnet_spec, _, _ = synthesis( + outputs = synthesis( model=self.tts_model, text=sen, CONFIG=self.tts_config, @@ -232,8 +232,10 @@ class Synthesizer(object): style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - speaker_embedding=speaker_embedding, + x_vector=speaker_embedding, ) + waveform = outputs['wav'] + mel_postnet_spec = outputs['model_outputs'] if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T From 8f47f959987513ba0a4e12bd7ce280c6f5da05e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 15:18:10 +0200 Subject: [PATCH 146/258] correct import of `load_meta_data` remove redundant import --- TTS/bin/compute_embeddings.py | 2 +- TTS/bin/compute_statistics.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/bin/train_align_tts.py | 2 +- TTS/bin/train_encoder.py | 2 +- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_speedy_speech.py | 2 +- TTS/bin/train_tacotron.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 872fc875..885d66b3 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -7,7 +7,7 @@ from tqdm import tqdm from TTS.config import BaseDatasetConfig, load_config from TTS.speaker_encoder.utils.generic_utils import setup_model -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index f3234c2a..25e3fce5 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -10,7 +10,7 @@ from tqdm import tqdm # from TTS.utils.io import load_config from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.utils.audio import AudioProcessor diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 970600aa..934055e4 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -10,7 +10,7 @@ from torch.utils.data import DataLoader from tqdm import tqdm from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model from TTS.tts.utils.speakers import parse_speakers diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py index b94d7a65..34eba7a8 100644 --- a/TTS/bin/train_align_tts.py +++ b/TTS/bin/train_align_tts.py @@ -13,7 +13,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import AlignTTSLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_encoder.py 
b/TTS/bin/train_encoder.py index 48309dc9..6e4a9b32 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,7 +13,7 @@ from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.utils.arguments import init_training from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9df185ee..a138abeb 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -14,7 +14,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index 57ff4272..4dc3f5f0 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -15,7 +15,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import SpeedySpeechLoss from TTS.tts.models import setup_model diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index f833ffc6..69ffbb6c 100755 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -11,7 +11,7 @@ import numpy as np import torch from torch.utils.data import DataLoader -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.layers.losses import TacotronLoss from TTS.tts.models import setup_model From c392fa4288db7975b998eaf1134b2b624b205e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 15:18:36 +0200 Subject: [PATCH 147/258] update `extract_tts_spectrograms` for the new model API --- TTS/bin/extract_tts_spectrograms.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 934055e4..e162bf4f 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -146,20 +146,22 @@ def inference( elif speaker_embeddings is not None: speaker_c = speaker_embeddings - model_output, *_ = model.inference_with_MAS( + outputs = model.inference_with_MAS( text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c ) + model_output = outputs['model_outputs'] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - _, postnet_outputs, *_ = model( + cond_input = {'speaker_ids': speaker_ids, 'x_vectors': speaker_embeddings} + outputs = model( text_input, text_lengths, mel_input, mel_lengths, - speaker_ids=speaker_ids, - 
speaker_embeddings=speaker_embeddings, + cond_input ) + postnet_outputs = outputs['model_outputs'] # normalize tacotron output if model_name == "tacotron": mel_specs = [] From 5ab28fa61807d712517063e004a09e160ddc8bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 10:49:48 +0200 Subject: [PATCH 148/258] update `extract_tts_spec...` using `SpeakerManager` --- TTS/bin/extract_tts_spectrograms.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index e162bf4f..78830925 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -13,7 +13,7 @@ from TTS.config import load_config from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model -from TTS.tts.utils.speakers import parse_speakers +from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -39,7 +39,9 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None, + speaker_mapping=speaker_manager.speaker_ids + if c.use_speaker_embedding and c.use_external_speaker_embedding_file + else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -91,7 +93,7 @@ def format_data(data): speaker_embeddings = data[8] speaker_ids = None else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] + speaker_ids = [speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] speaker_ids = torch.LongTensor(speaker_ids) speaker_embeddings = None else: @@ -134,12 +136,11 @@ def inference( text_lengths, mel_input, mel_lengths, - attn_mask=None, speaker_ids=None, speaker_embeddings=None, ): if model_name == "glow_tts": - mel_input = mel_input.permute(0, 2, 1) # B x D x T + # mel_input = mel_input.permute(0, 2, 1) # B x D x T speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids @@ -147,9 +148,9 @@ def inference( speaker_c = speaker_embeddings outputs = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c + text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c} ) - model_output = outputs['model_outputs'] + model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: @@ -193,7 +194,7 @@ def extract_spectrograms( speaker_embeddings, _, _, - attn_mask, + _, item_idx, ) = format_data(data) @@ -205,7 +206,6 @@ def extract_spectrograms( text_lengths, mel_input, mel_lengths, - attn_mask, speaker_ids, speaker_embeddings, ) @@ -242,7 +242,7 @@ def extract_spectrograms( def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data, symbols, phonemes, model_characters, speaker_mapping + global meta_data, symbols, phonemes, model_characters, speaker_manager # Audio processor ap = AudioProcessor(**c.audio) @@ -260,10 +260,10 @@ def main(args): # pylint: disable=redefined-outer-name meta_data = meta_data_train + meta_data_eval # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = 
parse_speakers(c, args, meta_data_train, None) + speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim) + model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") From 469d2e620af5cf039af6f4390917797aeba63967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:35:26 +0200 Subject: [PATCH 149/258] update extract_tts_spectrogram for `cond_input` API of the models --- TTS/bin/extract_tts_spectrograms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 78830925..72df9bc7 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -140,7 +140,6 @@ def inference( speaker_embeddings=None, ): if model_name == "glow_tts": - # mel_input = mel_input.permute(0, 2, 1) # B x D x T speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids From b500338faa1a1403a109731f8c2cbfdfc5dc6bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 17:25:00 +0200 Subject: [PATCH 150/258] make style --- TTS/bin/extract_tts_spectrograms.py | 12 +- TTS/bin/train_tts.py | 9 +- TTS/trainer.py | 140 ++++------- TTS/tts/configs/align_tts_config.py | 21 +- TTS/tts/configs/glow_tts_config.py | 4 +- TTS/tts/configs/shared_configs.py | 4 +- TTS/tts/configs/speedy_speech_config.py | 18 +- TTS/tts/configs/tacotron_config.py | 20 +- TTS/tts/datasets/__init__.py | 10 +- TTS/tts/layers/losses.py | 7 +- TTS/tts/models/align_tts.py | 166 +++++-------- TTS/tts/models/glow_tts.py | 186 ++++++--------- TTS/tts/models/speedy_speech.py | 126 ++++------ TTS/tts/models/tacotron.py | 167 ++++++------- TTS/tts/models/tacotron2.py | 251 ++++++++++---------- TTS/tts/models/tacotron_abstract.py | 2 +- TTS/tts/utils/synthesis.py | 64 ++--- TTS/utils/arguments.py | 39 +-- TTS/utils/synthesizer.py | 4 +- tests/tts_tests/test_align_tts_train.py | 4 +- tests/tts_tests/test_glow_tts_train.py | 4 +- tests/tts_tests/test_speedy_speech_train.py | 4 +- tests/tts_tests/test_tacotron2_train.py | 4 +- tests/tts_tests/test_tacotron_train.py | 4 +- tests/vocoder_tests/test_melgan_train.py | 1 - 25 files changed, 524 insertions(+), 747 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 72df9bc7..016b389f 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -153,15 +153,9 @@ def inference( model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': speaker_embeddings} - outputs = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input - ) - postnet_outputs = outputs['model_outputs'] + cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) + postnet_outputs = outputs["model_outputs"] # normalize tacotron output if model_name == "tacotron": mel_specs = [] diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 7cc8a25f..607a4e3b 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -8,13 +8,8 @@ from TTS.trainer import TrainerTTS def main(): # try: - args, config, OUT_PATH, AUDIO_PATH, c_logger, 
tb_logger = init_training( - sys.argv) - trainer = TrainerTTS(args, - config, - c_logger, - tb_logger, - output_path=OUT_PATH) + args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) trainer.fit() # except KeyboardInterrupt: # remove_experiment_folder(OUT_PATH) diff --git a/TTS/trainer.py b/TTS/trainer.py index 63b9cd42..06d5d6b5 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -84,7 +84,7 @@ class TrainerTTS: self.best_loss = float("inf") self.train_loader = None self.eval_loader = None - self.output_audio_path = os.path.join(output_path, 'test_audios') + self.output_audio_path = os.path.join(output_path, "test_audios") self.keep_avg_train = None self.keep_avg_eval = None @@ -138,8 +138,8 @@ class TrainerTTS: if self.args.restore_path: self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, - self.scaler) + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) # setup scheduler self.scheduler = self.get_scheduler(self.config, self.optimizer) @@ -207,6 +207,7 @@ class TrainerTTS: return None if lr_scheduler.lower() == "noamlr": from TTS.utils.training import NoamLR + scheduler = NoamLR else: scheduler = getattr(torch.optim, lr_scheduler) @@ -261,8 +262,7 @@ class TrainerTTS: ap=ap, tp=self.config.characters, add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else - self.config.batch_group_size * self.config.batch_size, + batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, min_seq_len=self.config.min_seq_len, max_seq_len=self.config.max_seq_len, phoneme_cache_path=self.config.phoneme_cache_path, @@ -272,8 +272,8 @@ class TrainerTTS: use_noise_augment=not is_eval, verbose=verbose, speaker_mapping=speaker_mapping - if self.config.use_speaker_embedding - and self.config.use_external_speaker_embedding_file else None, + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -281,18 +281,15 @@ class TrainerTTS: dataset.compute_input_seq(self.config.num_loader_workers) dataset.sort_items() - sampler = DistributedSampler( - dataset) if self.num_gpus > 1 else None + sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None loader = DataLoader( dataset, - batch_size=self.config.eval_batch_size - if is_eval else self.config.batch_size, + batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, sampler=sampler, - num_workers=self.config.num_val_loader_workers - if is_eval else self.config.num_loader_workers, + num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, pin_memory=False, ) return loader @@ -314,8 +311,7 @@ class TrainerTTS: text_input = batch[0] text_lengths = batch[1] speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron" - ] else None + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None mel_input = batch[4] mel_lengths = batch[5] stop_targets = batch[6] @@ -331,10 +327,7 @@ class TrainerTTS: speaker_embeddings = batch[8] speaker_ids = None else: - speaker_ids = [ - self.speaker_manager.speaker_ids[speaker_name] - for speaker_name in speaker_names - ] + speaker_ids = 
[self.speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] speaker_ids = torch.LongTensor(speaker_ids) speaker_embeddings = None else: @@ -346,7 +339,7 @@ class TrainerTTS: durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) for idx, am in enumerate(attn_mask): # compute raw durations - c_idxs = am[:, :text_lengths[idx], :mel_lengths[idx]].max(1)[1] + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) c_idxs, counts = torch.unique(c_idxs, return_counts=True) dur = torch.ones([text_lengths[idx]]).to(counts.dtype) @@ -359,14 +352,11 @@ class TrainerTTS: assert ( dur.sum() == mel_lengths[idx] ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, :text_lengths[idx]] = dur + durations[idx, : text_lengths[idx]] = dur # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], - stop_targets.size(1) // self.config.r, - -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze(2) + stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) # dispatch batch to GPU if self.use_cuda: @@ -374,15 +364,10 @@ class TrainerTTS: text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda( - non_blocking=True) if self.config.model.lower() in [ - "tacotron" - ] else None + linear_input = linear_input.cuda(non_blocking=True) if self.config.model.lower() in ["tacotron"] else None stop_targets = stop_targets.cuda(non_blocking=True) - attn_mask = attn_mask.cuda( - non_blocking=True) if attn_mask is not None else None - durations = durations.cuda( - non_blocking=True) if attn_mask is not None else None + attn_mask = attn_mask.cuda(non_blocking=True) if attn_mask is not None else None + durations = durations.cuda(non_blocking=True) if attn_mask is not None else None if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) if speaker_embeddings is not None: @@ -401,7 +386,7 @@ class TrainerTTS: "x_vectors": speaker_embeddings, "max_text_length": max_text_length, "max_spec_length": max_spec_length, - "item_idx": item_idx + "item_idx": item_idx, } def train_step(self, batch: Dict, batch_n_steps: int, step: int, @@ -421,25 +406,20 @@ class TrainerTTS: # check nan loss if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError( - f"Detected NaN loss at step {self.total_steps_done}.") + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") # optimizer step if self.config.mixed_precision: # model optimizer step in mixed precision mode self.scaler.scale(loss_dict["loss"]).backward() self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, - self.config.grad_clip, - ignore_stopnet=True) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) self.scaler.step(self.optimizer) self.scaler.update() else: # main model optimizer step loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, - self.config.grad_clip, - ignore_stopnet=True) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) self.optimizer.step() step_time = time.time() - step_start_time @@ -469,17 +449,15 @@ class TrainerTTS: current_lr = 
self.optimizer.param_groups[0]["lr"] if self.total_steps_done % self.config.print_step == 0: log_dict = { - "max_spec_length": [batch["max_spec_length"], - 1], # value, precision + "max_spec_length": [batch["max_spec_length"], 1], # value, precision "max_text_length": [batch["max_text_length"], 1], "step_time": [step_time, 4], "loader_time": [loader_time, 2], "current_lr": current_lr, } - self.c_logger.print_train_step(batch_n_steps, step, - self.total_steps_done, log_dict, - loss_dict, - self.keep_avg_train.avg_values) + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) if self.args.rank == 0: # Plot Training Iter Stats @@ -491,8 +469,7 @@ class TrainerTTS: "step_time": step_time, } iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, - iter_stats) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) if self.total_steps_done % self.config.save_step == 0: if self.config.checkpoint: @@ -506,15 +483,12 @@ class TrainerTTS: self.output_path, model_loss=loss_dict["loss"], characters=self.model_characters, - scaler=self.scaler.state_dict() - if self.config.mixed_precision else None, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) # training visualizations figures, audios = self.model.train_log(self.ap, batch, outputs) self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, - {"TrainAudio": audios}, - self.ap.sample_rate) + self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) self.total_steps_done += 1 self.on_train_step_end() return outputs, loss_dict @@ -523,35 +497,28 @@ class TrainerTTS: self.model.train() epoch_start_time = time.time() if self.use_cuda: - batch_num_steps = int( - len(self.train_loader.dataset) / - (self.config.batch_size * self.num_gpus)) + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) else: - batch_num_steps = int( - len(self.train_loader.dataset) / self.config.batch_size) + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) self.c_logger.print_train_start() loader_start_time = time.time() for cur_step, batch in enumerate(self.train_loader): - _, _ = self.train_step(batch, batch_num_steps, cur_step, - loader_start_time) + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) epoch_time = time.time() - epoch_start_time # Plot self.epochs_done Stats if self.args.rank == 0: epoch_stats = {"epoch_time": epoch_time} epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, - epoch_stats) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, - self.total_steps_done) + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.eval_step( - batch, self.criterion) + outputs, loss_dict = self.model.eval_step(batch, self.criterion) step_time = time.time() - step_start_time @@ -572,8 +539,7 @@ class TrainerTTS: self.keep_avg_eval.update_values(update_eval_values) if self.config.print_eval: - self.c_logger.print_eval_step(step, 
loss_dict, - self.keep_avg_eval.avg_values) + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) return outputs, loss_dict def eval_epoch(self) -> None: @@ -585,15 +551,13 @@ class TrainerTTS: # format data batch = self.format_batch(batch) loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({'avg_loader_time': loader_time}) + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) outputs, _ = self.eval_step(batch, cur_step) # Plot epoch stats and samples from the last batch. if self.args.rank == 0: figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, - {"EvalAudio": eval_audios}, - self.ap.sample_rate) + self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) def test_run(self, ) -> None: print(" | > Synthesizing test sentences.") @@ -608,9 +572,9 @@ class TrainerTTS: self.config, self.use_cuda, self.ap, - speaker_id=cond_inputs['speaker_id'], - x_vector=cond_inputs['x_vector'], - style_wav=cond_inputs['style_wav'], + speaker_id=cond_inputs["speaker_id"], + x_vector=cond_inputs["x_vector"], + style_wav=cond_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, @@ -623,10 +587,8 @@ class TrainerTTS: "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram( - model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment( - alignment, output_fig=False) + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) @@ -641,11 +603,11 @@ class TrainerTTS: if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None) # setup style_mel - if self.config.has('gst_style_input'): + if self.config.has("gst_style_input"): style_wav = self.config.gst_style_input else: style_wav = None - if style_wav is None and 'use_gst' in self.config and self.config.use_gst: + if style_wav is None and "use_gst" in self.config and self.config.use_gst: # inicialize GST with zero dict. 
style_wav = {} print( @@ -688,8 +650,7 @@ class TrainerTTS: for epoch in range(0, self.config.epochs): self.on_epoch_start() self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage( - ) if self.config.run_eval else None + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None self.epochs_done = epoch self.c_logger.print_epoch_start(epoch, self.config.epochs) self.train_epoch() @@ -698,8 +659,8 @@ class TrainerTTS: if epoch >= self.config.test_delay_epochs: self.test_run() self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values - if self.config.run_eval else self.keep_avg_train.avg_values) + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) self.save_best_model() self.on_epoch_end() @@ -717,8 +678,7 @@ class TrainerTTS: self.model_characters, keep_all_best=self.config.keep_all_best, keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() - if self.config.mixed_precision else None, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) @staticmethod diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 115e969c..56622741 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -93,7 +93,7 @@ class AlignTTSConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "Adam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = None lr_scheduler_params: dict = None lr: float = 1e-4 @@ -104,12 +104,13 @@ class AlignTTSConfig(BaseTTSConfig): max_seq_len: int = 200 r: int = 1 - # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) - + # testing + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 214b2377..925854c9 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -91,9 +91,9 @@ class GlowTTSConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = "NoamLR" - lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) grad_clip: float = 5.0 lr: float = 1e-3 diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index a2d935c7..d02e58ae 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -174,7 +174,7 @@ class BaseTTSConfig(BaseTrainingConfig): optimizer: str = MISSING optimizer_params: dict = MISSING # scheduler - lr_scheduler: str = '' + lr_scheduler: str = "" lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing - test_sentences: List[str] = field(default_factory=lambda:[]) + test_sentences: List[str] = field(default_factory=lambda: []) diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 42258398..d76d94e2 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -101,7 +101,7 @@ class SpeedySpeechConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = None lr_scheduler_params: dict = None lr: float = 1e-4 @@ -118,10 +118,12 @@ class SpeedySpeechConfig(BaseTTSConfig): r: int = 1 # DO NOT CHANGE # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 90decaa3..b197eaf6 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -160,9 +160,9 @@ class TacotronConfig(BaseTTSConfig): # optimizer parameters optimizer: str = "RAdam" - optimizer_params: dict = field(default_factory=lambda: {'betas': [0.9, 0.998], 'weight_decay': 1e-6}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) lr_scheduler: str = "NoamLR" - lr_scheduler_params: dict = field(default_factory=lambda:{"warmup_steps": 4000}) + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) lr: float = 1e-4 grad_clip: float = 5.0 seq_len_norm: bool = False @@ -178,13 +178,15 @@ class TacotronConfig(BaseTTSConfig): ga_alpha: float = 5.0 # testing - test_sentences: List[str] = field(default_factory=lambda:[ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963." - ]) + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) def check_values(self): if self.gradual_training: diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index b238209f..69ab871d 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -44,22 +44,18 @@ def load_meta_data(datasets, eval_split=True): preprocessor = _get_preprocessor_by_name(name) # load train set meta_data_train = preprocessor(root_path, meta_file_train) - print( - f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}" - ) + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: if meta_file_val: meta_data_eval = preprocessor(root_path, meta_file_val) else: - meta_data_eval, meta_data_train = split_dataset( - meta_data_train) + meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for duration predictor training if dataset.meta_file_attn_mask: - meta_data = dict( - load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins[1]].strip() meta_data_train_all[idx].append(attn_file) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 517eb533..86d34c30 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -506,5 +506,10 @@ class AlignTTSLoss(nn.Module): spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens) ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens) dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens) - loss = self.spec_loss_alpha * spec_loss + self.ssim_alpha * ssim_loss + self.dur_loss_alpha * dur_loss + self.mdn_alpha * mdn_loss + loss = ( + self.spec_loss_alpha * 
spec_loss + + self.ssim_alpha * ssim_loss + + self.dur_loss_alpha * dur_loss + + self.mdn_alpha * mdn_loss + ) return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 6d61eae2..f94d9ca6 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -72,19 +72,9 @@ class AlignTTS(nn.Module): hidden_channels=256, hidden_channels_dp=256, encoder_type="fftransformer", - encoder_params={ - "hidden_channels_ffn": 1024, - "num_heads": 2, - "num_layers": 6, - "dropout_p": 0.1 - }, + encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, decoder_type="fftransformer", - decoder_params={ - "hidden_channels_ffn": 1024, - "num_heads": 2, - "num_layers": 6, - "dropout_p": 0.1 - }, + decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, length_scale=1, num_speakers=0, external_c=False, @@ -93,14 +83,11 @@ class AlignTTS(nn.Module): super().__init__() self.phase = -1 - self.length_scale = float(length_scale) if isinstance( - length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, - encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, - decoder_params) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) self.duration_predictor = DurationPredictor(hidden_channels_dp) self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) @@ -121,9 +108,9 @@ class AlignTTS(nn.Module): mu = mu.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] log_sigma = log_sigma.transpose(1, 2).unsqueeze(2) # [B, T2, 1, D] expanded_y, expanded_mu = torch.broadcast_tensors(y, mu) - exponential = -0.5 * torch.mean(torch._C._nn.mse_loss( - expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), - dim=-1) # B, L, T + exponential = -0.5 * torch.mean( + torch._C._nn.mse_loss(expanded_y, expanded_mu, 0) / torch.pow(log_sigma.exp(), 2), dim=-1 + ) # B, L, T logp = exponential - 0.5 * log_sigma.mean(dim=-1) return logp @@ -157,9 +144,7 @@ class AlignTTS(nn.Module): [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) - o_en_ex = torch.matmul( - attn.squeeze(1).transpose(1, 2), en.transpose(1, - 2)).transpose(1, 2) + o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -193,8 +178,7 @@ class AlignTTS(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), - 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -207,8 +191,7 @@ class AlignTTS(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = 
self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -224,13 +207,13 @@ class AlignTTS(nn.Module): def _forward_mdn(self, o_en, y, y_lengths, x_mask): # MAS potentials and alignment mu, log_sigma = self.mdn_block(o_en) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en.dtype) - dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, - y_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) + dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward(self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None + ): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] @@ -240,83 +223,58 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) attn = self.convert_dr_to_align(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, _, _, _ = self._forward_mdn(o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en.detach(), - o_en_dp.detach(), - dr_mas.detach(), - x_mask, - y_lengths, - g=g) + o_de, attn = self._forward_decoder(o_en.detach(), o_en_dp.detach(), dr_mas.detach(), x_mask, y_lengths, g=g) elif phase == 2: # train the whole except duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) elif phase == 3: # train duration predictor o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(x, x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) o_dr_log = o_dr_log.squeeze(1) else: o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - dr_mas, mu, log_sigma, logp = self._forward_mdn( - o_en, y, y_lengths, x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr_mas, - x_mask, - y_lengths, - g=g) + dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) o_dr_log = 
o_dr_log.squeeze(1) dr_mas_log = torch.log(dr_mas + 1).squeeze(1) outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'alignments': attn, - 'durations_log': o_dr_log, - 'durations_mas_log': dr_mas_log, - 'mu': mu, - 'log_sigma': log_sigma, - 'logp': logp + "model_outputs": o_de.transpose(1, 2), + "alignments": attn, + "durations_log": o_dr_log, + "durations_mas_log": dr_mas_log, + "mu": mu, + "log_sigma": log_sigma, + "logp": logp, } return outputs @torch.no_grad() - def inference(self, x, cond_input={'x_vectors': None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -326,46 +284,40 @@ class AlignTTS(nn.Module): # duration predictor pass o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - o_dr, - x_mask, - y_lengths, - g=g) - outputs = {'model_outputs': o_de.transpose(1, 2), 'alignments': attn} + o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn} return outputs def train_step(self, batch: dict, criterion: nn.Module): - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] - speaker_ids = batch['speaker_ids'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] + speaker_ids = batch["speaker_ids"] - cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} + cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) loss_dict = criterion( - outputs['logp'], - outputs['model_outputs'], - mel_input, - mel_lengths, - outputs['durations_log'], - outputs['durations_mas_log'], - text_lengths, - phase=self.phase, - ) + outputs["logp"], + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + outputs["durations_mas_log"], + text_lengths, + phase=self.phase, + ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], - binary=True) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -387,7 +339,9 @@ class AlignTTS(nn.Module): def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): return self.train_log(ap, batch, outputs) - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def 
load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 09e58ce7..e1c07212 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -38,6 +38,7 @@ class GlowTTS(nn.Module): encoder_params (dict): encoder module parameters. speaker_embedding_dim (int): channels of external speaker embedding vectors. """ + def __init__( self, num_chars, @@ -132,17 +133,17 @@ class GlowTTS(nn.Module): @staticmethod def compute_outputs(attn, o_mean, o_log_scale, x_mask): # compute final values with the computed alignment - y_mean = torch.matmul( - attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( - 1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] - y_log_scale = torch.matmul( - attn.squeeze(1).transpose(1, 2), o_log_scale.transpose( - 1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t'] + y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] + y_log_scale = torch.matmul(attn.squeeze(1).transpose(1, 2), o_log_scale.transpose(1, 2)).transpose( + 1, 2 + ) # [b, t', t], [b, t, d] -> [b, d, t'] # compute total duration with adjustment o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y, y_lengths=None, cond_input={'x_vectors':None}): + def forward(self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None}): """ Shapes: x: [B, T] @@ -154,7 +155,7 @@ class GlowTTS(nn.Module): y_max_length = y.size(2) y = y.transpose(1, 2) # norm speaker embeddings - g = cond_input['x_vectors'] + g = cond_input["x_vectors"] if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -162,54 +163,38 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess( - y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path with torch.no_grad(): o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, - [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * - (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), - z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, - [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] - attn = maximum_path(logp, - attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, o_mean, o_log_scale, x_mask) + attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - 'model_outputs': z, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": z, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @torch.no_grad() - def inference_with_MAS(self, - x, - x_lengths, - y=None, - y_lengths=None, - attn=None, - g=None): + def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): """ It's similar to the teacher forcing in Tacotron. It was proposed in: https://arxiv.org/abs/2104.05557 @@ -229,33 +214,24 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # drop redisual frames wrt num_squeeze and set y_lengths. 
- y, y_lengths, y_max_length, attn = self.preprocess( - y, y_lengths, y_max_length, None) + y, y_lengths, y_max_length, attn = self.preprocess(y, y_lengths, y_max_length, None) # create masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) # find the alignment path between z and encoder output o_scale = torch.exp(-2 * o_log_scale) - logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, - [1]).unsqueeze(-1) # [b, t, 1] - logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * - (z**2)) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), - z) # [b, t, d] x [b, d, t'] = [b, t, t'] - logp4 = torch.sum(-0.5 * (o_mean**2) * o_scale, - [1]).unsqueeze(-1) # [b, t, 1] + logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - o_log_scale, [1]).unsqueeze(-1) # [b, t, 1] + logp2 = torch.matmul(o_scale.transpose(1, 2), -0.5 * (z ** 2)) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp3 = torch.matmul((o_mean * o_scale).transpose(1, 2), z) # [b, t, d] x [b, d, t'] = [b, t, t'] + logp4 = torch.sum(-0.5 * (o_mean ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1] logp = logp1 + logp2 + logp3 + logp4 # [b, t, t'] attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach() - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, o_mean, o_log_scale, x_mask) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) # get predited aligned distribution @@ -264,13 +240,13 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = { - 'model_outputs': y, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": y, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @@ -290,8 +266,7 @@ class GlowTTS(nn.Module): else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(y.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(y.dtype) # decoder pass z, logdet = self.decoder(y, y_mask, g=g, reverse=False) @@ -310,37 +285,31 @@ class GlowTTS(nn.Module): g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] # embedding pass - o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, - x_lengths, - g=g) + o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g) # compute output durations w = (torch.exp(o_dur_log) - 1) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() y_max_length = None # compute masks - y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), - 1).to(x_mask.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype) attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) # compute attention mask - attn = generate_path(w_ceil.squeeze(1), - attn_mask.squeeze(1)).unsqueeze(1) - y_mean, y_log_scale, o_attn_dur = self.compute_outputs( - attn, 
o_mean, o_log_scale, x_mask) + attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1) + y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) - z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * - self.inference_noise_scale) * y_mask + z = (y_mean + torch.exp(y_log_scale) * torch.randn_like(y_mean) * self.inference_noise_scale) * y_mask # decoder pass y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - 'model_outputs': y, - 'logdet': logdet, - 'y_mean': y_mean, - 'y_log_scale': y_log_scale, - 'alignments': attn, - 'durations_log': o_dur_log, - 'total_durations_log': o_attn_dur + "model_outputs": y, + "logdet": logdet, + "y_mean": y_mean, + "y_log_scale": y_log_scale, + "alignments": attn, + "durations_log": o_dur_log, + "total_durations_log": o_attn_dur, } return outputs @@ -351,32 +320,34 @@ class GlowTTS(nn.Module): batch (dict): [description] criterion (nn.Module): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] - outputs = self.forward(text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input={"x_vectors": x_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors}) - loss_dict = criterion(outputs['model_outputs'], outputs['y_mean'], - outputs['y_log_scale'], outputs['logdet'], - mel_lengths, outputs['durations_log'], - outputs['total_durations_log'], text_lengths) + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + text_lengths, + ) - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], binary=True) + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -400,8 +371,7 @@ class GlowTTS(nn.Module): def preprocess(self, y, y_lengths, y_max_length, attn=None): if y_max_length is not None: - y_max_length = (y_max_length // - self.num_squeeze) * self.num_squeeze + y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze y = y[:, :, :y_max_length] if attn is not None: attn = attn[:, :, :, :y_max_length] @@ -411,7 +381,9 @@ class GlowTTS(nn.Module): def store_inverse(self): self.decoder.store_inverse() - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) 
self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index daf67b6c..69070ffa 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -49,12 +49,7 @@ class SpeedySpeech(nn.Module): positional_encoding=True, length_scale=1, encoder_type="residual_conv_bn", - encoder_params={ - "kernel_size": 4, - "dilations": 4 * [1, 2, 4] + [1], - "num_conv_blocks": 2, - "num_res_blocks": 13 - }, + encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, decoder_type="residual_conv_bn", decoder_params={ "kernel_size": 4, @@ -68,17 +63,13 @@ class SpeedySpeech(nn.Module): ): super().__init__() - self.length_scale = float(length_scale) if isinstance( - length_scale, int) else length_scale + self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, - encoder_params, c_in_channels) + self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) if positional_encoding: self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, - decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + - c_in_channels) + self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) + self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) if num_speakers > 1 and not external_c: # speaker embedding layer @@ -105,9 +96,7 @@ class SpeedySpeech(nn.Module): """ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype) - o_en_ex = torch.matmul( - attn.squeeze(1).transpose(1, 2), en.transpose(1, - 2)).transpose(1, 2) + o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) return o_en_ex, attn def format_durations(self, o_dr_log, x_mask): @@ -141,8 +130,7 @@ class SpeedySpeech(nn.Module): x_emb = torch.transpose(x_emb, 1, -1) # compute sequence masks - x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), - 1).to(x.dtype) + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.shape[1]), 1).to(x.dtype) # encoder pass o_en = self.encoder(x_emb, x_mask) @@ -155,8 +143,7 @@ class SpeedySpeech(nn.Module): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), - 1).to(o_en_dp.dtype) + y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) # positional encoding @@ -169,15 +156,9 @@ class SpeedySpeech(nn.Module): o_de = self.decoder(o_en_ex, y_mask, g=g) return o_de, attn.transpose(1, 2) - def forward(self, - x, - x_lengths, - y_lengths, - dr, - cond_input={ - 'x_vectors': None, - 'speaker_ids': None - }): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None} + ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids Shapes: @@ -187,91 +168,68 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in 
cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - dr, - x_mask, - y_lengths, - g=g) - outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'durations_log': o_dr_log.squeeze(1), - 'alignments': attn - } + o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, - x, - cond_input={ - 'x_vectors': None, - 'speaker_ids': None - }): # pylint: disable=unused-argument + def inference(self, x, cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input['x_vectors'] if 'x_vectors' in cond_input else None + g = cond_input["x_vectors"] if "x_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: inference_padding += 13 - x.shape[1] # pad input to prevent dropping the last word - x = torch.nn.functional.pad(x, - pad=(0, inference_padding), - mode="constant", - value=0) + x = torch.nn.functional.pad(x, pad=(0, inference_padding), mode="constant", value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) # duration predictor pass o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) - o_de, attn = self._forward_decoder(o_en, - o_en_dp, - o_dr, - x_mask, - y_lengths, - g=g) - outputs = { - 'model_outputs': o_de.transpose(1, 2), - 'alignments': attn, - 'durations_log': None - } + o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn, "durations_log": None} return outputs def train_step(self, batch: dict, criterion: nn.Module): - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - x_vectors = batch['x_vectors'] - speaker_ids = batch['speaker_ids'] - durations = batch['durations'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + x_vectors = batch["x_vectors"] + speaker_ids = batch["speaker_ids"] + durations = batch["durations"] - cond_input = {'x_vectors': x_vectors, 'speaker_ids': speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_lengths, - durations, cond_input) + cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) # compute loss - loss_dict = criterion(outputs['model_outputs'], mel_input, - mel_lengths, outputs['durations_log'], - torch.log(1 + durations), text_lengths) + loss_dict = criterion( + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + torch.log(1 + durations), + text_lengths, + ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments'], - binary=True) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap: AudioProcessor, batch: dict, 
outputs: dict): - model_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - mel_input = batch['mel_input'] + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] pred_spec = model_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -293,7 +251,9 @@ class SpeedySpeech(nn.Module): def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): return self.train_log(ap, batch, outputs) - def load_checkpoint(self, config, checkpoint_path, eval=False): # pylint: disable=unused-argument, redefined-builtin + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin state = torch.load(checkpoint_path, map_location=torch.device("cpu")) self.load_state_dict(state["model"]) if eval: diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 34f04159..19af28ff 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -50,6 +50,7 @@ class Tacotron(TacotronAbstract): gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ + def __init__( self, num_chars, @@ -78,7 +79,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, - gradual_training=[] + gradual_training=[], ): super().__init__( num_chars, @@ -106,15 +107,14 @@ class Tacotron(TacotronAbstract): speaker_embedding_dim, use_gst, gst, - gradual_training + gradual_training, ) # speaker embedding layers if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, - speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -145,8 +145,7 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, - postnet_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) # setup prenet dropout self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference @@ -183,12 +182,7 @@ class Tacotron(TacotronAbstract): separate_stopnet, ) - def forward(self, - text, - text_lengths, - mel_specs=None, - mel_lengths=None, - cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): """ Shapes: text: [B, T_in] @@ -197,100 +191,87 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - outputs = { - 'alignments_backward': None, - 'decoder_outputs_backward': None - } + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim inputs = self.embedding(text) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( - encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) # speaker 
embedding if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, - None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) # sequence masking if output_mask is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze( - 1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) # B x T_out x decoder_in_features postnet_outputs = self.postnet(decoder_outputs) # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze( - 2).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze(2).expand_as(postnet_outputs) # B x T_out x posnet_dim postnet_outputs = self.last_linear(postnet_outputs) # B x T_out x decoder_in_features decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() if self.bidirectional_decoder: - decoder_outputs_backward, alignments_backward = self._backward_pass( - mel_specs, encoder_outputs, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward - outputs.update({ - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens - }) + mel_specs, encoder_outputs, alignments, input_mask + ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) return outputs @torch.no_grad() - def inference(self, - text_input, - cond_input=None): + def inference(self, text_input, cond_input=None): inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, 
None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) outputs = { - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, } return outputs @@ -301,64 +282,61 @@ class Tacotron(TacotronAbstract): batch ([type]): [description] criterion ([type]): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - linear_input = batch['linear_input'] - stop_targets = batch['stop_targets'] - speaker_ids = batch['speaker_ids'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + x_vectors = batch["x_vectors"] # forward pass model - outputs = self.forward(text_input, - text_lengths, - mel_input, - mel_lengths, - cond_input={ - 'speaker_ids': speaker_ids, - 'x_vectors': x_vectors - }) + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + ) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: alignment_lengths = ( - mel_lengths + - (self.decoder.r - - (mel_lengths.max() % self.decoder.r))) // self.decoder.r + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, - mel_lengths, cond_input) + cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss loss_dict = criterion( - outputs['model_outputs'], - outputs['decoder_outputs'], + outputs["model_outputs"], + outputs["decoder_outputs"], mel_input, linear_input, - outputs['stop_tokens'], + outputs["stop_tokens"], stop_targets, mel_lengths, - outputs['decoder_outputs_backward'], - outputs['alignments'], + outputs["decoder_outputs_backward"], + outputs["alignments"], alignment_lengths, - outputs['alignments_backward'], + outputs["alignments_backward"], text_lengths, ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments']) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap, 
batch, outputs): - postnet_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - alignments_backward = outputs['alignments_backward'] - mel_input = batch['mel_input'] + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] pred_spec = postnet_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -371,8 +349,7 @@ class Tacotron(TacotronAbstract): } if self.bidirectional_decoder or self.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False) + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) # Sample audio train_audio = ap.inv_spectrogram(pred_spec.T) @@ -382,4 +359,4 @@ class Tacotron(TacotronAbstract): return self.train_step(batch, criterion) def eval_log(self, ap, batch, outputs): - return self.train_log(ap, batch, outputs) \ No newline at end of file + return self.train_log(ap, batch, outputs) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 04b97606..4e111fda 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -49,49 +49,70 @@ class Tacotron2(TacotronAbstract): gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ - def __init__(self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - gradual_training=[]): - super().__init__(num_chars, num_speakers, r, postnet_output_dim, - decoder_output_dim, attn_type, attn_win, attn_norm, - prenet_type, prenet_dropout, - prenet_dropout_at_inference, forward_attn, - trans_agent, forward_attn_mask, location_attn, attn_K, - separate_stopnet, bidirectional_decoder, - double_decoder_consistency, ddc_r, - encoder_in_features, decoder_in_features, - speaker_embedding_dim, use_gst, gst, gradual_training) + + def __init__( + self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type="original", + attn_win=False, + attn_norm="softmax", + prenet_type="original", + prenet_dropout=True, + prenet_dropout_at_inference=False, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=5, + separate_stopnet=True, + bidirectional_decoder=False, + double_decoder_consistency=False, + ddc_r=None, + encoder_in_features=512, + decoder_in_features=512, + speaker_embedding_dim=None, + use_gst=False, + gst=None, + gradual_training=[], + ): + super().__init__( + num_chars, + num_speakers, + r, + postnet_output_dim, + decoder_output_dim, + attn_type, + attn_win, + attn_norm, + prenet_type, + prenet_dropout, + prenet_dropout_at_inference, + forward_attn, + trans_agent, + forward_attn_mask, + location_attn, + attn_K, + separate_stopnet, + bidirectional_decoder, + double_decoder_consistency, + ddc_r, + encoder_in_features, + decoder_in_features, + speaker_embedding_dim, + use_gst, + gst, + 
gradual_training, + ) # speaker embedding layer if self.num_speakers > 1: if not self.embeddings_per_sample: speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, - speaker_embedding_dim) + self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input @@ -162,12 +183,7 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, - text, - text_lengths, - mel_specs=None, - mel_lengths=None, - cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): """ Shapes: text: [B, T_in] @@ -176,10 +192,7 @@ class Tacotron2(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - outputs = { - 'alignments_backward': None, - 'decoder_outputs_backward': None - } + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -189,55 +202,49 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input['speaker_ids'])[:, - None] + speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input['x_vectors'], 1) - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, speaker_embeddings) + speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) - encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as( - encoder_outputs) + encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r - decoder_outputs, alignments, stop_tokens = self.decoder( - encoder_outputs, mel_specs, input_mask) + decoder_outputs, alignments, stop_tokens = self.decoder(encoder_outputs, mel_specs, input_mask) # sequence masking if mel_lengths is not None: - decoder_outputs = decoder_outputs * output_mask.unsqueeze( - 1).expand_as(decoder_outputs) + decoder_outputs = decoder_outputs * output_mask.unsqueeze(1).expand_as(decoder_outputs) # B x mel_dim x T_out postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs # sequence masking if output_mask is not None: - postnet_outputs = postnet_outputs * output_mask.unsqueeze( - 1).expand_as(postnet_outputs) + postnet_outputs = postnet_outputs * output_mask.unsqueeze(1).expand_as(postnet_outputs) # B x T_out x mel_dim -- B x T_out x mel_dim -- B x T_out//r x T_in - decoder_outputs, postnet_outputs, alignments = self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) if self.bidirectional_decoder: - decoder_outputs_backward, 
alignments_backward = self._backward_pass( - mel_specs, encoder_outputs, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward + decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( - mel_specs, encoder_outputs, alignments, input_mask) - outputs['alignments_backward'] = alignments_backward - outputs['decoder_outputs_backward'] = decoder_outputs_backward - outputs.update({ - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens - }) + mel_specs, encoder_outputs, alignments, input_mask + ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) return outputs @torch.no_grad() @@ -247,29 +254,25 @@ class Tacotron2(TacotronAbstract): if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input['style_mel'], - cond_input['x_vectors']) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input['x_vectors'] + x_vector = cond_input["x_vectors"] - encoder_outputs = self._concat_speaker_embedding( - encoder_outputs, x_vector) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector) - decoder_outputs, alignments, stop_tokens = self.decoder.inference( - encoder_outputs) + decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs - decoder_outputs, postnet_outputs, alignments = self.shape_outputs( - decoder_outputs, postnet_outputs, alignments) + decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) outputs = { - 'model_outputs': postnet_outputs, - 'decoder_outputs': decoder_outputs, - 'alignments': alignments, - 'stop_tokens': stop_tokens + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, } return outputs @@ -280,64 +283,61 @@ class Tacotron2(TacotronAbstract): batch ([type]): [description] criterion ([type]): [description] """ - text_input = batch['text_input'] - text_lengths = batch['text_lengths'] - mel_input = batch['mel_input'] - mel_lengths = batch['mel_lengths'] - linear_input = batch['linear_input'] - stop_targets = batch['stop_targets'] - speaker_ids = batch['speaker_ids'] - x_vectors = batch['x_vectors'] + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + x_vectors = batch["x_vectors"] # forward pass model - outputs = self.forward(text_input, - 
text_lengths, - mel_input, - mel_lengths, - cond_input={ - 'speaker_ids': speaker_ids, - 'x_vectors': x_vectors - }) + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + ) # set the [alignment] lengths wrt reduction factor for guided attention if mel_lengths.max() % self.decoder.r != 0: alignment_lengths = ( - mel_lengths + - (self.decoder.r - - (mel_lengths.max() % self.decoder.r))) // self.decoder.r + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {'speaker_ids': speaker_ids, 'x_vectors': x_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, - mel_lengths, cond_input) + cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss loss_dict = criterion( - outputs['model_outputs'], - outputs['decoder_outputs'], + outputs["model_outputs"], + outputs["decoder_outputs"], mel_input, linear_input, - outputs['stop_tokens'], + outputs["stop_tokens"], stop_targets, mel_lengths, - outputs['decoder_outputs_backward'], - outputs['alignments'], + outputs["decoder_outputs_backward"], + outputs["alignments"], alignment_lengths, - outputs['alignments_backward'], + outputs["alignments_backward"], text_lengths, ) # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(outputs['alignments']) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) loss_dict["align_error"] = align_error return outputs, loss_dict def train_log(self, ap, batch, outputs): - postnet_outputs = outputs['model_outputs'] - alignments = outputs['alignments'] - alignments_backward = outputs['alignments_backward'] - mel_input = batch['mel_input'] + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] pred_spec = postnet_outputs[0].data.cpu().numpy() gt_spec = mel_input[0].data.cpu().numpy() @@ -350,8 +350,7 @@ class Tacotron2(TacotronAbstract): } if self.bidirectional_decoder or self.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False) + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 2bea06a9..49487b67 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -37,7 +37,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[] + gradual_training=[], ): """Abstract Tacotron class""" super().__init__() diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index d58886e9..d27b9eb0 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -59,25 +59,16 @@ def numpy_to_tf(np_array, dtype): def compute_style_mel(style_wav, ap, cuda=False): - style_mel = torch.FloatTensor( - ap.melspectrogram(ap.load_wav(style_wav, - sr=ap.sample_rate))).unsqueeze(0) + style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0) if cuda: return style_mel.cuda() return style_mel -def 
run_model_torch(model, - inputs, - speaker_id=None, - style_mel=None, - x_vector=None): - outputs = model.inference(inputs, - cond_input={ - 'speaker_ids': speaker_id, - 'x_vector': x_vector, - 'style_mel': style_mel - }) +def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): + outputs = model.inference( + inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} + ) return outputs @@ -87,18 +78,15 @@ def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): if speaker_id is not None: raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") # TODO: handle multispeaker case - decoder_output, postnet_output, alignments, stop_tokens = model( - inputs, training=False) + decoder_output, postnet_output, alignments, stop_tokens = model(inputs, training=False) return decoder_output, postnet_output, alignments, stop_tokens def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: - raise NotImplementedError( - " [!] GST inference not implemented for TfLite") + raise NotImplementedError(" [!] GST inference not implemented for TfLite") if speaker_id is not None: - raise NotImplementedError( - " [!] Multi-Speaker not implemented for TfLite") + raise NotImplementedError(" [!] Multi-Speaker not implemented for TfLite") # get input and output details input_details = model.get_input_details() output_details = model.get_output_details() @@ -132,7 +120,7 @@ def parse_outputs_tflite(postnet_output, decoder_output): def trim_silence(wav, ap): - return wav[:ap.find_endpoint(wav)] + return wav[: ap.find_endpoint(wav)] def inv_spectrogram(postnet_output, ap, CONFIG): @@ -155,8 +143,7 @@ def speaker_id_to_torch(speaker_id, cuda=False): def embedding_to_torch(x_vector, cuda=False): if x_vector is not None: x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze(0).type( - torch.FloatTensor) + x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: return x_vector.cuda() return x_vector @@ -173,8 +160,7 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): """ wavs = [] for idx, spec in enumerate(inputs): - wav_len = (input_lens[idx] * - ap.hop_length) - ap.hop_length # inverse librosa padding + wav_len = (input_lens[idx] * ap.hop_length) - ap.hop_length # inverse librosa padding wav = inv_spectrogram(spec, ap, CONFIG) # assert len(wav) == wav_len, f" [!] 
wav lenght: {len(wav)} vs expected: {wav_len}" wavs.append(wav[:wav_len]) @@ -242,23 +228,21 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, - text_inputs, - speaker_id, - style_mel, - x_vector=x_vector) - model_outputs = outputs['model_outputs'] + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) + model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, text_inputs, CONFIG, speaker_id, style_mel) + model, text_inputs, CONFIG, speaker_id, style_mel + ) model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( - postnet_output, decoder_output, alignments, stop_tokens) + postnet_output, decoder_output, alignments, stop_tokens + ) elif backend == "tflite": decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, text_inputs, CONFIG, speaker_id, style_mel) - model_outputs, decoder_output = parse_outputs_tflite( - postnet_output, decoder_output) + model, text_inputs, CONFIG, speaker_id, style_mel + ) + model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None @@ -268,9 +252,9 @@ def synthesis( if do_trim_silence: wav = trim_silence(wav, ap) return_dict = { - 'wav': wav, - 'alignments': outputs['alignments'], - 'model_outputs': model_outputs, - 'text_inputs': text_inputs + "wav": wav, + "alignments": outputs["alignments"], + "model_outputs": model_outputs, + "text_inputs": text_inputs, } return return_dict diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 3fc63e26..5e6acd1d 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -30,16 +30,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=("Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored."), + help=( + "Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored." + ), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="") + "--restore_path", type=str, help="Model file to be restored. 
Use to finetune a model.", default="" + ) parser.add_argument( "--best_path", type=str, @@ -49,23 +49,12 @@ def init_arguments(argv): ), default="", ) - parser.add_argument("--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in argv) - parser.add_argument("--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.") parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", - type=str, - default="", - help="DISTRIBUTED: process group id.") + "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv + ) + parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") + parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") return parser @@ -160,8 +149,7 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, - config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -182,8 +170,7 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", - 0) + tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a8332eb8..a31436d4 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -234,8 +234,8 @@ class Synthesizer(object): use_griffin_lim=use_gl, x_vector=speaker_embedding, ) - waveform = outputs['wav'] - mel_postnet_spec = outputs['model_outputs'] + waveform = outputs["wav"] + mel_postnet_spec = outputs["model_outputs"] if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 4bf3802f..3d802d5f 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -44,8 +44,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 0ae25701..7e6c069c 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -46,8 +46,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index c8716fb0..6be3da97 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -45,8 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index aef507a5..0d9a67a5 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -45,8 +45,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_train.py 
b/tests/tts_tests/test_tacotron_train.py index 771ad93c..52560715 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -44,8 +44,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index e3004db7..3ff65b5a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -21,7 +21,6 @@ config = MelganConfig( print_step=1, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, print_eval=True, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, ) From 0eec238429d5ace3a405abeed2211984997749a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:32:08 +0200 Subject: [PATCH 151/258] remove redundant imports --- TTS/trainer.py | 5 ----- TTS/utils/{ => logging}/console_logger.py | 0 TTS/utils/{ => logging}/tensorboard_logger.py | 0 3 files changed, 5 deletions(-) rename TTS/utils/{ => logging}/console_logger.py (100%) rename TTS/utils/{ => logging}/tensorboard_logger.py (100%) diff --git a/TTS/trainer.py b/TTS/trainer.py index 06d5d6b5..7a31bb34 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -3,13 +3,8 @@ import importlib import logging import os -import sys import time -import traceback -from logging import StreamHandler -from random import randrange -import numpy as np import torch # DISTRIBUTED diff --git a/TTS/utils/console_logger.py b/TTS/utils/logging/console_logger.py similarity index 100% rename from TTS/utils/console_logger.py rename to TTS/utils/logging/console_logger.py diff --git a/TTS/utils/tensorboard_logger.py b/TTS/utils/logging/tensorboard_logger.py similarity index 100% rename from TTS/utils/tensorboard_logger.py rename to TTS/utils/logging/tensorboard_logger.py From 72dceca52cf312dd497d3e9ecbd758e8e27d1a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:32:35 +0200 Subject: [PATCH 152/258] import missings --- TTS/trainer.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 7a31bb34..372bb0f6 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -7,23 +7,29 @@ import time import torch +from coqpit import Coqpit +from dataclasses import dataclass, field +from typing import Tuple, Dict, List, Union + +from argparse import Namespace # DISTRIBUTED +from torch import nn from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis -from 
TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, find_module, remove_experiment_folder, set_init_dict +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict from TTS.utils.training import check_update, setup_torch_training_env From 0cee5042a937383fc53258cf26ae9912a42a13ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:34:15 +0200 Subject: [PATCH 153/258] fix logger imports --- TTS/utils/arguments.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 5e6acd1d..90abd3b5 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -11,10 +11,9 @@ import torch from TTS.config import load_config from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.generic_utils import create_experiment_folder, get_git_branch from TTS.utils.io import copy_model_files -from TTS.utils.tensorboard_logger import TensorboardLogger def init_arguments(argv): From b643e8b37cbf8f14724fe6e68bac6221dfdfef73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:35:54 +0200 Subject: [PATCH 154/258] `logging/__init__.py` --- TTS/utils/logging/__init__.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 TTS/utils/logging/__init__.py diff --git a/TTS/utils/logging/__init__.py b/TTS/utils/logging/__init__.py new file mode 100644 index 00000000..877131c4 --- /dev/null +++ b/TTS/utils/logging/__init__.py @@ -0,0 +1,2 @@ +from TTS.utils.logging.console_logger import ConsoleLogger +from TTS.utils.logging.tensorboard_logger import TensorboardLogger From d96ebcd6d3eeace2e62794d602a64555f8ff3344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 13:37:08 +0200 Subject: [PATCH 155/258] make style --- TTS/bin/convert_tacotron2_torch_to_tf.py | 2 +- TTS/bin/train_tts.py | 3 +- TTS/trainer.py | 218 +++++++++++------------ TTS/tts/datasets/__init__.py | 6 +- TTS/tts/datasets/formatters.py | 1 - TTS/tts/models/align_tts.py | 6 +- TTS/tts/models/glow_tts.py | 4 +- TTS/tts/models/speedy_speech.py | 6 +- TTS/tts/models/tacotron.py | 4 +- TTS/tts/models/tacotron2.py | 4 +- TTS/tts/utils/data.py | 2 +- TTS/tts/utils/speakers.py | 2 +- TTS/utils/arguments.py | 2 +- 13 files changed, 130 insertions(+), 130 deletions(-) diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index e7f991be..119529ae 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -8,10 +8,10 @@ import numpy as np import tensorflow as tf import torch +from TTS.tts.models import setup_model from TTS.tts.tf.models.tacotron2 import Tacotron2 from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.generic_utils import save_checkpoint -from TTS.tts.models import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.utils.io import 
load_config diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 607a4e3b..8182b23f 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,9 +1,10 @@ import os import sys import traceback + +from TTS.trainer import TrainerTTS from TTS.utils.arguments import init_training from TTS.utils.generic_utils import remove_experiment_folder -from TTS.trainer import TrainerTTS def main(): diff --git a/TTS/trainer.py b/TTS/trainer.py index 372bb0f6..cb905d3a 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -4,21 +4,19 @@ import importlib import logging import os import time +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Union import torch - from coqpit import Coqpit -from dataclasses import dataclass, field -from typing import Tuple, Dict, List, Union -from argparse import Namespace # DISTRIBUTED from torch import nn from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler -from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model @@ -30,49 +28,48 @@ from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict +from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.training import check_update, setup_torch_training_env @dataclass class TrainingArgs(Coqpit): continue_path: str = field( - default='', + default="", metadata={ - 'help': - 'Path to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder.' - }) + "help": "Path to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder." + }, + ) restore_path: str = field( - default='', + default="", metadata={ - 'help': - 'Path to a model checkpoit. Restore the model with the given checkpoint and start a new training.' - }) + "help": "Path to a model checkpoit. Restore the model with the given checkpoint and start a new training." + }, + ) best_path: str = field( - default='', + default="", metadata={ - 'help': - "Best model file to be used for extracting best loss. If not specified, the latest best model in continue path is used" - }) - config_path: str = field( - default='', metadata={'help': 'Path to the configuration file.'}) - rank: int = field( - default=0, metadata={'help': 'Process rank in distributed training.'}) - group_id: str = field( - default='', - metadata={'help': 'Process group id in distributed training.'}) + "help": "Best model file to be used for extracting best loss. 
If not specified, the latest best model in continue path is used" + }, + ) + config_path: str = field(default="", metadata={"help": "Path to the configuration file."}) + rank: int = field(default=0, metadata={"help": "Process rank in distributed training."}) + group_id: str = field(default="", metadata={"help": "Process group id in distributed training."}) # pylint: disable=import-outside-toplevel, too-many-public-methods class TrainerTTS: use_cuda, num_gpus = setup_torch_training_env(True, False) - def __init__(self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger, - tb_logger: TensorboardLogger, - model: nn.Module = None, - output_path: str = None) -> None: + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger, + tb_logger: TensorboardLogger, + model: nn.Module = None, + output_path: str = None, + ) -> None: self.args = args self.config = config self.c_logger = c_logger @@ -90,8 +87,7 @@ class TrainerTTS: self.keep_avg_train = None self.keep_avg_eval = None - log_file = os.path.join(self.output_path, - f"trainer_{args.rank}_log.txt") + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") self._setup_logger_config(log_file) # model, audio processor, datasets, loss @@ -106,16 +102,19 @@ class TrainerTTS: # default speaker manager self.speaker_manager = self.get_speaker_manager( - self.config, args.restore_path, self.config.output_path, self.data_train) + self.config, args.restore_path, self.config.output_path, self.data_train + ) # init TTS model if model is not None: self.model = model else: self.model = self.get_model( - len(self.model_characters), self.speaker_manager.num_speakers, - self.config, self.speaker_manager.x_vector_dim - if self.speaker_manager.x_vectors else None) + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None, + ) # setup criterion self.criterion = self.get_criterion(self.config) @@ -126,13 +125,16 @@ class TrainerTTS: # DISTRUBUTED if self.num_gpus > 1: - init_distributed(args.rank, self.num_gpus, args.group_id, - self.config.distributed["backend"], - self.config.distributed["url"]) + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed["backend"], + self.config.distributed["url"], + ) # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler( - ) if self.config.mixed_precision and self.use_cuda else None + self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None # setup optimizer self.optimizer = self.get_optimizer(self.model, self.config) @@ -154,8 +156,7 @@ class TrainerTTS: print("\n > Model has {} parameters".format(num_params)) @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, - x_vector_dim: int) -> nn.Module: + def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module: model = setup_model(num_chars, num_speakers, config, x_vector_dim) return model @@ -182,26 +183,32 @@ class TrainerTTS: return model_characters @staticmethod - def get_speaker_manager(config: Coqpit, - restore_path: str = "", - out_path: str = "", - data_train: List = []) -> SpeakerManager: + def get_speaker_manager( + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = [] + ) -> SpeakerManager: speaker_manager = SpeakerManager() if restore_path: - speakers_file = 
os.path.join(os.path.dirname(restore_path), - "speaker.json") + speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" ) speakers_file = config.external_speaker_embedding_file + if config.use_external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(speakers_file) + else: + speaker_manager.load_ids_file(speakers_file) + elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: + speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file) + else: + speaker_manager.parse_speakers_from_items(data_train) + file_path = os.path.join(out_path, "speakers.json") speaker_manager.save_ids_file(file_path) return speaker_manager @staticmethod - def get_scheduler(config: Coqpit, - optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: + def get_scheduler(config: Coqpit, optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: @@ -224,7 +231,7 @@ class TrainerTTS: restore_path: str, model: nn.Module, optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None + scaler: torch.cuda.amp.GradScaler = None, ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: print(" > Restoring from %s ..." % os.path.basename(restore_path)) checkpoint = torch.load(restore_path) @@ -245,13 +252,21 @@ class TrainerTTS: for group in optimizer.param_groups: group["lr"] = self.config.lr - print(" > Model restored from step %d" % checkpoint["step"], ) + print( + " > Model restored from step %d" % checkpoint["step"], + ) restore_step = checkpoint["step"] return model, optimizer, scaler, restore_step - def _get_loader(self, r: int, ap: AudioProcessor, is_eval: bool, - data_items: List, verbose: bool, - speaker_mapping: Union[Dict, List]) -> DataLoader: + def _get_loader( + self, + r: int, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + speaker_mapping: Union[Dict, List], + ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None else: @@ -295,17 +310,15 @@ class TrainerTTS: ) return loader - def get_train_dataloader(self, r: int, ap: AudioProcessor, - data_items: List, verbose: bool, - speaker_mapping: Union[List, Dict]) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, - speaker_mapping) + def get_train_dataloader( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + ) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, speaker_mapping) - def get_eval_dataloder(self, r: int, ap: AudioProcessor, data_items: List, - verbose: bool, - speaker_mapping: Union[List, Dict]) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, - speaker_mapping) + def get_eval_dataloder( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + ) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, speaker_mapping) def format_batch(self, batch: List) -> Dict: # setup input batch @@ -390,8 +403,7 @@ class TrainerTTS: "item_idx": item_idx, } - def train_step(self, batch: Dict, batch_n_steps: int, step: int, - loader_start_time: float) -> Tuple[Dict, Dict]: + def train_step(self, batch: Dict, 
batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -560,7 +572,9 @@ class TrainerTTS: self.tb_logger.tb_eval_figures(self.total_steps_done, figures) self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) - def test_run(self, ) -> None: + def test_run( + self, + ) -> None: print(" | > Synthesizing test sentences.") test_audios = {} test_figures = {} @@ -581,28 +595,26 @@ class TrainerTTS: do_trim_silence=False, ).values() - file_path = os.path.join(self.output_audio_path, - str(self.total_steps_done)) + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, - "TestSentence_{}.wav".format(idx)) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) self.ap.save_wav(wav, file_path) test_audios["{}-audio".format(idx)] = wav test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, - self.config.audio["sample_rate"]) + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup x_vector - x_vector = (self.speaker_manager.get_x_vectors_by_speaker( - self.speaker_manager.speaker_ids[0]) - if self.config.use_external_speaker_embedding_file - and self.config.use_speaker_embedding else None) + x_vector = ( + self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding + else None + ) # setup style_mel if self.config.has("gst_style_input"): style_wav = self.config.gst_style_input @@ -611,40 +623,29 @@ class TrainerTTS: if style_wav is None and "use_gst" in self.config and self.config.use_gst: # inicialize GST with zero dict. style_wav = {} - print( - "WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!" 
- ) + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = { - "speaker_id": speaker_id, - "style_wav": style_wav, - "x_vector": x_vector - } + cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector} return cond_inputs def fit(self) -> None: if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " - f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, - map_location="cpu")["model_loss"] + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] print(f" > Starting with loaded last best loss {self.best_loss}.") # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids) - self.eval_loader = (self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_mapping=self.speaker_manager.speaker_ids) - if self.config.run_eval else None) + self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + ) + self.eval_loader = ( + self.get_eval_dataloder( + self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + ) + if self.config.run_eval + else None + ) self.total_steps_done = self.restore_step @@ -667,8 +668,7 @@ class TrainerTTS: def save_best_model(self) -> None: self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] - if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], self.best_loss, self.model, self.optimizer, @@ -685,10 +685,8 @@ class TrainerTTS: @staticmethod def _setup_logger_config(log_file: str) -> None: logging.basicConfig( - level=logging.INFO, - format="", - handlers=[logging.FileHandler(log_file), - logging.StreamHandler()]) + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) def on_epoch_start(self) -> None: # pylint: disable=no-self-use if hasattr(self.model, "on_epoch_start"): diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 69ab871d..bcdbf6a6 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -1,9 +1,11 @@ import sys -import numpy as np from collections import Counter from pathlib import Path -from TTS.tts.datasets.TTSDataset import TTSDataset + +import numpy as np + from TTS.tts.datasets.formatters import * +from TTS.tts.datasets.TTSDataset import TTSDataset #################### # UTILITIES diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index f43733b1..815a1b1d 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -7,7 +7,6 @@ from typing import List from tqdm import tqdm - ######################## # DATASETS ######################## diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index f94d9ca6..e8f80251 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -4,13 +4,13 @@ import torch.nn as nn from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import 
DurationPredictor -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor class AlignTTS(nn.Module): diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index e1c07212..8cf19f79 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -6,11 +6,11 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder +from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path +from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.data import sequence_mask class GlowTTS(nn.Module): diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 69070ffa..f00af9ad 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -3,13 +3,13 @@ from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor class SpeedySpeech(nn.Module): diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 19af28ff..6059a0d2 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -2,11 +2,11 @@ import torch from torch import nn -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram class Tacotron(TacotronAbstract): diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 4e111fda..b39a9d6f 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -3,11 +3,11 @@ import numpy as np import torch from torch import nn -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, 
Encoder, Postnet from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram class Tacotron2(TacotronAbstract): diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 5f8624e6..3ff52195 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,5 +1,5 @@ -import torch import numpy as np +import torch def _pad_data(x, length): diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 374139ee..4bfe8299 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,7 +1,7 @@ import json import os import random -from typing import Union, List, Any +from typing import Any, List, Union import numpy as np import torch diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 90abd3b5..9d92ae82 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -11,9 +11,9 @@ import torch from TTS.config import load_config from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.generic_utils import create_experiment_folder, get_git_branch from TTS.utils.io import copy_model_files +from TTS.utils.logging import ConsoleLogger, TensorboardLogger def init_arguments(argv): From 8e52a6923035c729244532d506b557465f2a5927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 14:03:02 +0200 Subject: [PATCH 156/258] delete separate tts training scripts and pre-commit configuration --- .pre-commit-config.yaml | 17 +- TTS/bin/train_align_tts.py | 572 ------------------------- TTS/bin/train_glow_tts.py | 598 -------------------------- TTS/bin/train_speedy_speech.py | 578 ------------------------- TTS/bin/train_tacotron.py | 749 --------------------------------- 5 files changed, 16 insertions(+), 2498 deletions(-) delete mode 100644 TTS/bin/train_align_tts.py delete mode 100644 TTS/bin/train_glow_tts.py delete mode 100644 TTS/bin/train_speedy_speech.py delete mode 100755 TTS/bin/train_tacotron.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1ae28644..a70572dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,4 +9,19 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3 \ No newline at end of file + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort (python) + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] + - repo: https://github.com/pycqa/pylint + rev: v2.8.2 + hooks: + - id: pylint diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py deleted file mode 100644 index 34eba7a8..00000000 --- a/TTS/bin/train_align_tts.py +++ /dev/null @@ -1,572 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import AlignTTSLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import 
parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) -# torch.autograd.set_detect_anomaly(True) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, speaker_c, avg_text_length, avg_spec_length, item_idx - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, training_phase): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / 
config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # 
Diagnostic visualizations - if decoder_output is not None: - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch, training_phase): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = 
ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! 
Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = AlignTTSLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - - def set_phase(): - """Set AlignTTS training phase""" - if isinstance(config.phase_start_steps, list): - vals = [i < global_step for i in config.phase_start_steps] - if not True in vals: - phase = 0 - else: - phase = ( - 
len(config.phase_start_steps) - - [i < global_step for i in config.phase_start_steps][::-1].index(True) - - 1 - ) - else: - phase = None - return phase - - for epoch in range(0, config.epochs): - cur_phase = set_phase() - print(f"\n > Current AlignTTS phase: {cur_phase}") - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, cur_phase - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py deleted file mode 100644 index a138abeb..00000000 --- a/TTS/bin/train_glow_tts.py +++ /dev/null @@ -1,598 +0,0 @@ -#!/usr/bin/env python3 -"""Train Glow TTS model.""" - -import os -import sys -import time -import traceback -from random import randrange - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - 
speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - if attn_mask is not None: - attn_mask = attn_mask.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - item_idx, - ) - - -def data_depended_init(data_loader, model): - """Data depended initialization for activation normalization.""" - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - - model.train() - print(" > Data depended initialization ... 
") - num_iter = 0 - with torch.no_grad(): - for _, data in enumerate(data_loader): - - # format data - text_input, text_lengths, mel_input, mel_lengths, spekaer_embed, _, _, attn_mask, _ = format_data(data) - - # forward pass model - _ = model.forward(text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=spekaer_embed) - if num_iter == config.data_dep_init_steps: - break - num_iter += 1 - - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - return model - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": 
[avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_input, mel_lengths, speaker_c, _, _, attn_mask, _ = format_data(data) - - # forward pass model - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - 
loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - style_wav = config.style_wav_for_test - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! 
Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = GlowTTSLoss() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - model = data_depended_init(train_loader, model) - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, 
model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py deleted file mode 100644 index 4dc3f5f0..00000000 --- a/TTS/bin/train_speedy_speech.py +++ /dev/null @@ -1,578 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # compute durations from attention mask - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - attn_mask = attn_mask.cuda(non_blocking=True) - durations = durations.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - durations, - item_idx, - ) - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - dur_target, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % 
config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _, dur_target, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = 
loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? 
-def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # set the portion of the data used for training if set in config.json - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = SpeedySpeechLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, 
optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py deleted file mode 100755 index 69ffbb6c..00000000 --- a/TTS/bin/train_tacotron.py +++ /dev/null @@ -1,749 +0,0 @@ -#!/usr/bin/env python3 -"""Trains Tacotron based TTS models.""" - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.utils.data import DataLoader - -from TTS.tts.datasets import load_meta_data -from TTS.tts.datasets.TTSDataset import TTSDataset -from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import ( - NoamLR, - adam_weight_decay, - check_update, - gradual_training_scheduler, - set_weight_decay, - setup_torch_training_env, -) - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): - if is_val and not config.run_eval: - loader = None - else: - if dataset is None: - dataset = TTSDataset( - r, - config.text_cleaner, - compute_linear_spec=config.model.lower() == "tacotron", - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - verbose=verbose, - speaker_mapping=( - speaker_mapping - if (config.use_speaker_embedding and config.use_external_speaker_embedding_file) - else None - ), - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - linear_input = data[3] if config.model.lower() in ["tacotron"] else None - mel_input = data[4] - mel_lengths = data[5] - stop_targets = data[6] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - - # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if config.model.lower() in ["tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - if speaker_ids is not None: - speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) - - -def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch, scaler, scaler_st): - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # setup lr - if config.noam_schedule: - scheduler.step() - - optimizer.zero_grad() - if optimizer_st: - optimizer_st.zero_grad() - - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - else: - decoder_output, 
postnet_output, alignments, stop_tokens = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - decoder_backward_output = None - alignments_backward = None - - # set the [alignment] lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - # optimizer step - if config.mixed_precision: - # model optimizer step in mixed precision mode - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - scaler.step(optimizer) - scaler.update() - - # stopnet optimizer step - if config.separate_stopnet: - scaler_st.scale(loss_dict["stopnet_loss"]).backward() - scaler.unscale_(optimizer_st) - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - scaler_st.step(optimizer) - scaler_st.update() - else: - grad_norm_st = 0 - else: - # main model optimizer step - loss_dict["loss"].backward() - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - optimizer.step() - - # stopnet optimizer step - if config.separate_stopnet: - loss_dict["stopnet_loss"].backward() - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - optimizer_st.step() - else: - grad_norm_st = 0 - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - loss_dict["stopnet_loss"] = ( - reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) if config.stopnet else loss_dict["stopnet_loss"] - ) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "max_spec_length": [max_spec_length, 1], # value, precision - "max_text_length": [max_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": 
current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time, - } - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - model.decoder.r, - OUT_PATH, - optimizer_st=optimizer_st, - model_loss=loss_dict["postnet_loss"], - characters=model_characters, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - # Diagnostic visualizations - const_spec = postnet_output[0].data.cpu().numpy() - gt_spec = ( - linear_input[0].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[0].data.cpu().numpy() - ) - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - if config.bidirectional_decoder or config.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False - ) - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - if config.model in ["Tacotron", "TacotronGST"]: - train_audio = ap.inv_spectrogram(const_spec.T) - else: - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - _, - _, - ) = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 - - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - decoder_backward_output = None - alignments_backward = None - - # set the alignment lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % 
model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - if config.stopnet: - loss_dict["stopnet_loss"] = reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = ( - linear_input[idx].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[idx].data.cpu().numpy() - ) - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - if config.model.lower() in ["tacotron"]: - eval_audio = ap.inv_spectrogram(const_spec.T) - else: - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - - if config.bidirectional_decoder or config.double_decoder_consistency: - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures["alignment2"] = plot_alignment(align_b_img, output_fig=False) - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch > config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - speaker_id = 0 if config.use_speaker_embedding else None - speaker_embedding = ( - speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]]["embedding"] - if config.use_external_speaker_embedding_file and config.use_speaker_embedding - else None - ) - style_wav = config.gst_style_input - if style_wav is None and config.gst is not None: - # inicialize GST with zero dict. - style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - - # setup custom characters if set in config file. 
- if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model_characters = phonemes if config.use_phonemes else symbols - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # set the portion of the data used for training - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim) - - # scalers for mixed precision training - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - scaler_st = torch.cuda.amp.GradScaler() if config.mixed_precision and config.separate_stopnet else None - - params = set_weight_decay(model, config.wd) - optimizer = RAdam(params, lr=config.lr, weight_decay=0) - if config.stopnet and config.separate_stopnet: - optimizer_st = RAdam(model.decoder.stopnet.parameters(), lr=config.lr, weight_decay=0) - else: - optimizer_st = None - - # setup criterion - criterion = TacotronLoss(config, stopnet_pos_weight=config.stopnet_pos_weight, ga_sigma=0.4) - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - # optimizer restore - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = apply_gradient_allreduce(model) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define data loaders - train_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 
model.decoder.r, is_val=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - # set gradual training - if config.gradual_training is not None: - r, config.batch_size = gradual_training_scheduler(global_step, config) - config.r = r - model.decoder.set_r(r) - if config.bidirectional_decoder: - model.decoder_backward.set_r(r) - train_loader.dataset.outputs_per_step = r - eval_loader.dataset.outputs_per_step = r - train_loader = setup_loader(ap, model.decoder.r, is_val=False, dataset=train_loader.dataset) - eval_loader = setup_loader(ap, model.decoder.r, is_val=True, dataset=eval_loader.dataset) - print("\n > Number of output frames:", model.decoder.r) - # train one epoch - train_avg_loss_dict, global_step = train( - train_loader, - model, - criterion, - optimizer, - optimizer_st, - scheduler, - ap, - global_step, - epoch, - scaler, - scaler_st, - ) - # eval one epoch - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_postnet_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_postnet_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) From 421194880d4dcb7121e79ce00996e230549b93a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 14:04:51 +0200 Subject: [PATCH 157/258] linter fixes --- TTS/bin/train_tts.py | 28 +++++------ TTS/trainer.py | 6 ++- TTS/tts/models/align_tts.py | 4 +- TTS/tts/models/glow_tts.py | 6 ++- TTS/tts/models/speedy_speech.py | 4 +- TTS/tts/models/tacotron.py | 2 +- TTS/tts/models/tacotron2.py | 3 +- TTS/tts/models/tacotron_abstract.py | 5 +- TTS/tts/utils/speakers.py | 74 ----------------------------- TTS/tts/utils/synthesis.py | 7 +-- 10 files changed, 34 insertions(+), 105 deletions(-) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 8182b23f..3270d0e0 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -8,20 +8,20 @@ from TTS.utils.generic_utils import remove_experiment_folder def main(): - # try: - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=OUT_PATH) - trainer.fit() - # except KeyboardInterrupt: - # remove_experiment_folder(OUT_PATH) - # try: - # sys.exit(0) - # except SystemExit: - # os._exit(0) # pylint: disable=protected-access - # except Exception: # pylint: disable=broad-except - # remove_experiment_folder(OUT_PATH) - # traceback.print_exc() - # sys.exit(1) + try: + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=output_path) + trainer.fit() + except KeyboardInterrupt: + remove_experiment_folder(output_path) + try: + sys.exit(0) + except 
SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(output_path) + traceback.print_exc() + sys.exit(1) if __name__ == "__main__": diff --git a/TTS/trainer.py b/TTS/trainer.py index cb905d3a..34d73874 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -184,7 +184,7 @@ class TrainerTTS: @staticmethod def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = [] + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None ) -> SpeakerManager: speaker_manager = SpeakerManager() if restore_path: @@ -208,7 +208,9 @@ class TrainerTTS: return speaker_manager @staticmethod - def get_scheduler(config: Coqpit, optimizer: torch.optim.Optimizer) -> torch.optim.lr_scheduler._LRScheduler: + def get_scheduler( + config: Coqpit, optimizer: torch.optim.Optimizer + ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access lr_scheduler = config.lr_scheduler lr_scheduler_params = config.lr_scheduler_params if lr_scheduler is None: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index e8f80251..6efa64e2 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -275,7 +275,7 @@ class AlignTTS(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) @@ -314,7 +314,7 @@ class AlignTTS(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 8cf19f79..9f20f6bb 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -143,7 +143,9 @@ class GlowTTS(nn.Module): o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask return y_mean, y_log_scale, o_attn_dur - def forward(self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None}): + def forward( + self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ Shapes: x: [B, T] @@ -344,7 +346,7 @@ class GlowTTS(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index f00af9ad..96ef1740 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -183,7 +183,7 @@ class SpeedySpeech(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable # input sequence should be greated 
than the max convolution size inference_padding = 5 if x.shape[1] < 13: @@ -226,7 +226,7 @@ class SpeedySpeech(nn.Module): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 6059a0d2..da574c05 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -79,7 +79,7 @@ class Tacotron(TacotronAbstract): use_gst=False, gst=None, memory_size=5, - gradual_training=[], + gradual_training=None, ): super().__init__( num_chars, diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index b39a9d6f..14a838d7 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,5 +1,4 @@ # coding: utf-8 -import numpy as np import torch from torch import nn @@ -77,7 +76,7 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[], + gradual_training=None, ): super().__init__( num_chars, diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 49487b67..8eb7bf24 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,5 +1,4 @@ import copy -import logging from abc import ABC, abstractmethod import torch @@ -37,7 +36,7 @@ class TacotronAbstract(ABC, nn.Module): speaker_embedding_dim=None, use_gst=False, gst=None, - gradual_training=[], + gradual_training=None, ): """Abstract Tacotron class""" super().__init__() @@ -239,4 +238,4 @@ class TacotronAbstract(ABC, nn.Module): trainer.model.decoder_backward.set_r(r) trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) - logging.info(f"\n > Number of output frames: {self.decoder.r}") + print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 4bfe8299..3239e9a5 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,5 +1,4 @@ import json -import os import random from typing import Any, List, Union @@ -11,79 +10,6 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - if os.path.splitext(out_path)[1] == ".json": - json_file = out_path - else: - json_file = make_speakers_json_path(out_path) - with open(json_file) as f: - return json.load(f) - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - if out_path is not None: - speakers_json_path = make_speakers_json_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def parse_speakers(c, args, meta_data_train, OUT_PATH): - """Returns number of speakers, speaker embedding shape and speaker mapping""" - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - if c.use_external_speaker_embedding_file: # if restore checkpoint and use External 
Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - if not speaker_mapping: - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - if not speaker_mapping: - raise RuntimeError( - "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" - ) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - not c.use_external_speaker_embedding_file - ): # if restore checkpoint and don't use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - speaker_embedding_dim = None - assert all(speaker in speaker_mapping for speaker in speakers), ( - "As of now you, you cannot " "introduce new speakers to " "a previously trained model." - ) - elif ( - c.use_external_speaker_embedding_file and c.external_speaker_embedding_file - ): # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file - ): # if start new train using External Embedding file and don't pass external embedding file - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - else: # if start new train and don't use External Embedding file - speaker_mapping = {name: i for i, name in enumerate(speakers)} - speaker_embedding_dim = None - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers))) - else: - num_speakers = 0 - speaker_embedding_dim = None - speaker_mapping = None - - return num_speakers, speaker_embedding_dim, speaker_mapping - - class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. 
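The `SpeakerManager` docstring above outlines three multi-speaker scenarios; below is a minimal usage sketch of the two most common ones. It assumes the constructor arguments and query methods shown in this series (`data_items`, `x_vectors_file_path`, `x_vector_dim`, `get_mean_x_vector`), a `meta_data_train` list as returned by `load_meta_data()`, and illustrative file paths and speaker names:

    from TTS.tts.utils.speakers import SpeakerManager

    # Scenario 1: plain speaker IDs derived from the training samples
    # (meta_data_train as returned by load_meta_data()).
    manager = SpeakerManager(data_items=meta_data_train)
    print(manager.speaker_ids)  # e.g. {"speaker_a": 0, "speaker_b": 1, ...}

    # Scenario 2: precomputed x-vectors loaded from a JSON file
    # ("speaker_xvectors.json" is an illustrative path, not a repo default).
    manager = SpeakerManager(x_vectors_file_path="speaker_xvectors.json")
    print(manager.x_vector_dim)  # dimensionality of the loaded embeddings
    # "p225" is a hypothetical speaker name used only for illustration.
    mean_vec = manager.get_mean_x_vector("p225", num_samples=5, randomize=True)

The same manager can then be handed to the trainer or synthesis code that needs speaker IDs or embeddings; this is a sketch under the assumptions above, not a definitive recipe.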
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index d27b9eb0..7328ddae 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -231,15 +231,16 @@ def synthesis( outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() + alignments = outputs["alignments"] elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( model, text_inputs, CONFIG, speaker_id, style_mel ) - model_outputs, decoder_output, alignment, stop_tokens = parse_outputs_tf( + model_outputs, decoder_output, alignments, stop_tokens = parse_outputs_tf( postnet_output, decoder_output, alignments, stop_tokens ) elif backend == "tflite": - decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( + decoder_output, postnet_output, alignments, stop_tokens = run_model_tflite( model, text_inputs, CONFIG, speaker_id, style_mel ) model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) @@ -253,7 +254,7 @@ def synthesis( wav = trim_silence(wav, ap) return_dict = { "wav": wav, - "alignments": outputs["alignments"], + "alignments": alignments, "model_outputs": model_outputs, "text_inputs": text_inputs, } From f840268181b2c3b0570083a322a0ccff3fd37ef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 15:46:28 +0200 Subject: [PATCH 158/258] refactor `SpeakerManager` --- TTS/tts/utils/speakers.py | 199 +++++++++++++++++++++++++++++++++----- 1 file changed, 177 insertions(+), 22 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 3239e9a5..5c10c589 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,6 +1,7 @@ import json +import os import random -from typing import Any, List, Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch @@ -10,6 +11,71 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor +def make_speakers_json_path(out_path): + """Returns conventional speakers.json location.""" + return os.path.join(out_path, "speakers.json") + + +def load_speaker_mapping(out_path): + """Loads speaker mapping if already present.""" + if os.path.splitext(out_path)[1] == ".json": + json_file = out_path + else: + json_file = make_speakers_json_path(out_path) + with open(json_file) as f: + return json.load(f) + + +def save_speaker_mapping(out_path, speaker_mapping): + """Saves speaker mapping if not yet present.""" + if out_path is not None: + speakers_json_path = make_speakers_json_path(out_path) + with open(speakers_json_path, "w") as f: + json.dump(speaker_mapping, f, indent=4) + + +def get_speaker_manager(c, args, meta_data_train): + """Inititalize and return a `SpeakerManager` based on config values""" + speaker_manager = SpeakerManager() + if c.use_speaker_embedding: + speaker_manager.set_speaker_ids_from_data(meta_data_train) + if args.restore_path: + # restoring speaker manager from a previous run. 
+ if c.use_external_speaker_embedding_file: + # restore speaker manager with the embedding file + speakers_file = os.path.dirname(args.restore_path) + if not os.path.exists(speakers_file): + print( + "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" + ) + if not os.path.exists(c.external_speaker_embedding_file): + raise RuntimeError( + "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" + ) + speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) + speaker_manager.set_x_vectors_from_file(speakers_file) + elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. + speakers_file = os.path.dirname(args.restore_path) + speaker_ids_from_data = speaker_manager.speaker_ids + speaker_manager.set_speaker_ids_from_file(speakers_file) + assert all( + speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + ), " [!] You cannot introduce new speakers to a pre-trained model." + elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: + # new speaker manager with external speaker embeddings. + speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) + elif ( + c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file + ): # new speaker manager with speaker IDs file. + raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + print( + " > Training with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) + ) + return speaker_manager + + class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. @@ -64,24 +130,24 @@ class SpeakerManager: self.speaker_encoder_ap = None if data_items: - self.speaker_ids = self.parse_speakers() + self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) if x_vectors_file_path: - self.load_x_vectors_file(x_vectors_file_path) + self.set_x_vectors_from_file(x_vectors_file_path) if speaker_id_file_path: - self.load_ids_file(speaker_id_file_path) + self.set_speaker_ids_from_file(speaker_id_file_path) if encoder_model_path and encoder_config_path: self.init_speaker_encoder(encoder_model_path, encoder_config_path) @staticmethod - def _load_json(json_file_path: str): + def _load_json(json_file_path: str) -> Dict: with open(json_file_path) as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict): + def _save_json(json_file_path: str, data: dict) -> None: with open(json_file_path, "w") as f: json.dump(data, f, indent=4) @@ -91,35 +157,101 @@ class SpeakerManager: @property def x_vector_dim(self): - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + """Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" + if self.x_vectors: + return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + return 0 - def parse_speakers_from_items(self, items: list): + @staticmethod + def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: + """Parse speaker IDs from data samples retured by `load_meta_data()`. 
+ + Args: + items (list): Data sampled returned by `load_meta_data()`. + + Returns: + Tuple[Dict, int]: speaker IDs and number of speakers. + """ speakers = sorted({item[2] for item in items}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} - num_speakers = len(self.speaker_ids) - return self.speaker_ids, num_speakers + speaker_ids = {name: i for i, name in enumerate(speakers)} + num_speakers = len(speaker_ids) + return speaker_ids, num_speakers - def save_ids_file(self, file_path: str): - self._save_json(file_path, self.speaker_ids) + def set_speaker_ids_from_data(self, items: List) -> None: + """Set speaker IDs from data samples. - def load_ids_file(self, file_path: str): + Args: + items (List): Data sampled returned by `load_meta_data()`. + """ + self.speaker_ids, _ = self.parse_speakers_from_data(items) + + def set_speaker_ids_from_file(self, file_path: str) -> None: + """Set speaker IDs from a file. + + Args: + file_path (str): Path to the file. + """ self.speaker_ids = self._load_json(file_path) - def save_x_vectors_file(self, file_path: str): + def save_speaker_ids_to_file(self, file_path: str) -> None: + """Save speaker IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.speaker_ids) + + def save_x_vectors_to_file(self, file_path: str) -> None: + """Save x_vectors to a json file. + + Args: + file_path (str): Path to the output file. + """ self._save_json(file_path, self.x_vectors) - def load_x_vectors_file(self, file_path: str): + def set_x_vectors_from_file(self, file_path: str) -> None: + """Load x_vectors from a json file. + + Args: + file_path (str): Path to the target json file. + """ self.x_vectors = self._load_json(file_path) self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) - def get_x_vector_by_clip(self, clip_idx: str): + def get_x_vector_by_clip(self, clip_idx: str) -> List: + """Get x_vector by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: x_vector as a list. + """ return self.x_vectors[clip_idx]["embedding"] - def get_x_vectors_by_speaker(self, speaker_idx: str): + def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all x_vectors of a speaker. + + Args: + speaker_idx (str): Target speaker ID. + + Returns: + List[List]: all the x_vectors of the given speaker. + """ return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False): + def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.Array: + """Get mean x_vector of a speaker ID. + + Args: + speaker_idx (str): Target speaker ID. + num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. + + Returns: + np.Array: Mean x_vector. 
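        Example (a hedged usage sketch; the file path and speaker name are hypothetical)::

            manager = SpeakerManager(x_vectors_file_path="speakers.json")
            mean_vec = manager.get_mean_x_vector("ljspeech", num_samples=5, randomize=True)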
+ """ x_vectors = self.get_x_vectors_by_speaker(speaker_idx) if num_samples is None: x_vectors = np.stack(x_vectors).mean(0) @@ -131,13 +263,19 @@ class SpeakerManager: x_vectors = np.stack(x_vectors[:num_samples]).mean(0) return x_vectors - def get_speakers(self): + def get_speakers(self) -> List: return self.speaker_ids - def get_clips(self): + def get_clips(self) -> List: return sorted(self.x_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, True) @@ -147,6 +285,15 @@ class SpeakerManager: self.speaker_encoder_ap.do_trim_silence = True def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a x_vector from a given audio file. + + Args: + wav_file (Union[str, list]): Target file path. + + Returns: + list: Computed x_vector. + """ + def _compute(wav_file: str): waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) spec = self.speaker_encoder_ap.melspectrogram(waveform) @@ -168,7 +315,15 @@ class SpeakerManager: x_vector = _compute(wav_file) return x_vector[0].tolist() - def compute_x_vector(self, feats): + def compute_x_vector(self, feats: Union[torch.Tensor, np.Array]) -> List: + """Compute x_vector from features. + + Args: + feats (Union[torch.Tensor, np.Array]): Input features. + + Returns: + List: computed x_vector. + """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) if feats.ndim == 2: From 6c495c6a6ee4775d11b8a2026bd03a205f588b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 28 May 2021 19:30:50 +0200 Subject: [PATCH 159/258] fix glow-tts inference and forward functions for handling `cond_input` and refactor its test --- TTS/tts/models/glow_tts.py | 28 ++++++++++++++++++++-------- tests/tts_tests/test_glow_tts.py | 25 ++++++++++++++++--------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 9f20f6bb..2c944008 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -154,10 +154,10 @@ class GlowTTS(nn.Module): y_lengths: B g: [B, C] or B """ - y_max_length = y.size(2) y = y.transpose(1, 2) + y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -196,19 +196,23 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None): + def inference_with_MAS( + self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
It was proposed in: https://arxiv.org/abs/2104.05557 Shapes: x: [B, T] x_lenghts: B - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ + y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.external_speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) @@ -253,14 +257,18 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def decoder_inference(self, y, y_lengths=None, g=None): + def decoder_inference( + self, y, y_lengths=None, cond_input={"x_vectors": None} + ): # pylint: disable=dangerous-default-value """ Shapes: - y: [B, C, T] + y: [B, T, C] y_lengths: B g: [B, C] or B """ + y = y.transpose(1, 2) y_max_length = y.size(2) + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None # norm speaker embeddings if g is not None: if self.external_speaker_embedding_dim: @@ -276,10 +284,14 @@ class GlowTTS(nn.Module): # reverse decoder and predict y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - return y, logdet + outputs = {} + outputs["model_outputs"] = y + outputs["logdet"] = logdet + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): + def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value + g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None if g is not None: if self.speaker_embedding_dim: g = F.normalize(g).unsqueeze(-1) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 486de274..8a2a8fb3 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -34,7 +34,7 @@ class GlowTTSTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) @@ -114,10 +114,17 @@ class GlowTTSTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=0.001) for _ in range(5): optimizer.zero_grad() - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, None + outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, None) + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + input_lengths, ) - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, input_lengths) loss = loss_dict["loss"] loss.backward() optimizer.step() @@ -137,7 +144,7 @@ class GlowTTSInferenceTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) @@ -175,12 +182,12 @@ class GlowTTSInferenceTest(unittest.TestCase): print(" > Num parameters for 
GlowTTS model:%s" % (count_parameters(model))) # inference encoder and decoder with MAS - y, *_ = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths, None) + y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) - y_dec, _ = model.decoder_inference(mel_spec, mel_lengths) + y2 = model.decoder_inference(mel_spec, mel_lengths) assert ( - y_dec.shape == y.shape + y2["model_outputs"].shape == y["model_outputs"].shape ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y.shape, y_dec.shape + y["model_outputs"].shape, y2["model_outputs"].shape ) From ef4ea9e527b4092d5196de5efee87064f94c0a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 10:07:12 +0200 Subject: [PATCH 160/258] update imports for `formatters` --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/find_unique_chars.py | 6 +++--- notebooks/dataset_analysis/AnalyzeDataset.ipynb | 2 +- notebooks/dataset_analysis/PhonemeCoverage.ipynb | 2 +- tests/data_tests/test_dataset_formatters.py | 2 +- tests/data_tests/test_loader.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 3cbf40ba..eb708040 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -80,7 +80,7 @@ Example run: model.eval() # data loader - preprocessor = importlib.import_module("TTS.tts.datasets.preprocess") + preprocessor = importlib.import_module("TTS.tts.datasets.formatters") preprocessor = getattr(preprocessor, args.dataset) meta_data = preprocessor(args.data_path, args.dataset_metafile) dataset = TTSDataset( diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 7891d65a..75169569 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -3,14 +3,14 @@ import argparse import os from argparse import RawTextHelpFormatter -from TTS.tts.datasets.preprocess import get_preprocessor_by_name +from TTS.tts.datasets.formatters import get_preprocessor_by_name def main(): # pylint: disable=bad-option-value parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" - """Target dataset must be defined in TTS.tts.datasets.preprocess\n\n""" + """Target dataset must be defined in TTS.tts.datasets.formatters\n\n""" """ Example runs: @@ -20,7 +20,7 @@ def main(): ) parser.add_argument( - "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.preprocess." + "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.formatters." 
) parser.add_argument("--meta_file", type=str, default=None, help="Path to the transcriptions file of the dataset.") diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 8aa3a025..6ff2d2ca 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -31,7 +31,7 @@ "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", - "from TTS.tts.datasets.preprocess import *\n", + "from TTS.tts.datasets.formatters import *\n", "%matplotlib inline" ] }, diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb index f9540d06..e659511a 100644 --- a/notebooks/dataset_analysis/PhonemeCoverage.ipynb +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -50,7 +50,7 @@ "source": [ "# import stuff\n", "from TTS.utils.io import load_config\n", - "from TTS.tts.datasets.preprocess import load_meta_data\n", + "from TTS.tts.datasets.formatters import load_meta_data\n", "from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n", "from tqdm import tqdm\n", "from matplotlib import pylab as plt\n", diff --git a/tests/data_tests/test_dataset_formatters.py b/tests/data_tests/test_dataset_formatters.py index 968e2a29..bd83002c 100644 --- a/tests/data_tests/test_dataset_formatters.py +++ b/tests/data_tests/test_dataset_formatters.py @@ -2,7 +2,7 @@ import os import unittest from tests import get_tests_input_path -from TTS.tts.datasets.preprocess import common_voice +from TTS.tts.datasets.formatters import common_voice class TestPreprocessors(unittest.TestCase): diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 053da516..7f55b378 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader from tests import get_tests_output_path from TTS.tts.configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset -from TTS.tts.datasets.preprocess import ljspeech +from TTS.tts.datasets.formatters import ljspeech from TTS.utils.audio import AudioProcessor # pylint: disable=unused-variable From 8381379938b8e300c9112ff69ea28fc61002f4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:41:17 +0200 Subject: [PATCH 161/258] formating `cond_input` with a function in Tacotron models --- TTS/tts/models/tacotron.py | 2 ++ TTS/tts/models/tacotron2.py | 2 ++ TTS/tts/models/tacotron_abstract.py | 6 ++++++ TTS/utils/generic_utils.py | 17 +++++++++++++++++ 4 files changed, 27 insertions(+) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index da574c05..8d3124c3 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -191,6 +191,7 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x embed_dim @@ -250,6 +251,7 @@ class Tacotron(TacotronAbstract): @torch.no_grad() def inference(self, text_input, cond_input=None): + cond_input = self._format_cond_input(cond_input) inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py 
index 14a838d7..bd1ad03e 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -191,6 +191,7 @@ class Tacotron2(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ + cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) @@ -248,6 +249,7 @@ class Tacotron2(TacotronAbstract): @torch.no_grad() def inference(self, text, cond_input=None): + cond_input = self._format_cond_input(cond_input) embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 8eb7bf24..5e561066 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -1,10 +1,12 @@ import copy from abc import ABC, abstractmethod +from typing import Dict import torch from torch import nn from TTS.tts.utils.data import sequence_mask +from TTS.utils.generic_utils import format_cond_input from TTS.utils.training import gradual_training_scheduler @@ -94,6 +96,10 @@ class TacotronAbstract(ABC, nn.Module): self.decoder_backward = None self.coarse_decoder = None + @staticmethod + def _format_cond_input(cond_input: Dict) -> Dict: + return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input) + ############################# # INIT FUNCTIONS ############################# diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index a562e86f..0c28116d 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -8,6 +8,7 @@ import shutil import subprocess import sys from pathlib import Path +from typing import Dict import torch @@ -126,6 +127,22 @@ def set_init_dict(model_dict, checkpoint_state, c): return model_dict +def format_cond_input(def_args: Dict, kwargs: Dict) -> Dict: + """Format kwargs to hande auxilary inputs to models. + + Args: + def_args (Dict): A dictionary of argument names and their default values if not defined in `kwargs`. + kwargs (Dict): A `dict` or `kwargs` that includes auxilary inputs to the model. + + Returns: + Dict: arguments with formatted auxilary inputs. 
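    Example (a hedged sketch of the intended call pattern, mirroring `TacotronAbstract._format_cond_input`; `spk_ids` is a placeholder tensor)::

        defaults = {"x_vectors": None, "speaker_ids": None}
        cond_input = format_cond_input(defaults, {"speaker_ids": spk_ids})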
+ """ + for name in def_args: + if name not in kwargs: + kwargs[def_args[name]] = None + return kwargs + + class KeepAverage: def __init__(self): self.avg_values = {} From b22b7620c3dd515efc4daf2f0787b8bbe86b00d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:42:07 +0200 Subject: [PATCH 162/258] update glow-tts output shapes to match [B, T, C] --- TTS/tts/models/glow_tts.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 2c944008..af52ba1c 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -185,13 +185,13 @@ class GlowTTS(nn.Module): y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - "model_outputs": z, + "model_outputs": z.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs @@ -246,13 +246,13 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = { - "model_outputs": y, + "model_outputs": z.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs @@ -285,7 +285,7 @@ class GlowTTS(nn.Module): y, logdet = self.decoder(z, y_mask, g=g, reverse=True) outputs = {} - outputs["model_outputs"] = y + outputs["model_outputs"] = y.transpose(1, 2) outputs["logdet"] = logdet return outputs @@ -317,13 +317,13 @@ class GlowTTS(nn.Module): y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) outputs = { - "model_outputs": y, + "model_outputs": y.transpose(1, 2), "logdet": logdet, - "y_mean": y_mean, - "y_log_scale": y_log_scale, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), "alignments": attn, - "durations_log": o_dur_log, - "total_durations_log": o_attn_dur, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), } return outputs From 1443d03af1818d80fb3596d80306afe5fb682a36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:43:40 +0200 Subject: [PATCH 163/258] update test for the new input output API of the tts models --- tests/data_tests/test_loader.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 23 +++++-- tests/tts_tests/test_tacotron2_model.py | 72 ++++++++++---------- tests/tts_tests/test_tacotron_model.py | 50 +++++++------- 4 files changed, 79 insertions(+), 68 deletions(-) diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index 7f55b378..cad89d09 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -38,7 +38,7 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - dataset = TTSDataset.TTSDataset( + 
dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=True, diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 21a73812..66339a82 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -45,17 +45,25 @@ def test_speedy_speech(): model.cuda() # forward pass - o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations) + outputs = model(x_dummy, x_lengths, y_lengths, durations) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device)) + model.forward( + x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} + ) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] @@ -63,8 +71,11 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device)) + model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 4d711700..0933ec70 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -52,15 +52,15 @@ class TacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter 
changes @@ -85,7 +85,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_embeddings = torch.rand(8, 55).to(device) + speaker_ids = torch.rand(8, 55).to(device) for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 @@ -104,15 +104,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -157,15 +157,15 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -213,15 +213,15 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + 
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -270,15 +270,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index fcbac0f7..86de5d16 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -68,13 +68,13 @@ class TacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -129,13 +129,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -193,13 +193,13 @@ class 
TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -256,13 +256,13 @@ class TacotronGSTTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -318,13 +318,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes From 6d6896fd999dda0ef97f4bd90403f960e77ae3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 31 May 2021 15:44:09 +0200 Subject: [PATCH 164/258] reduce fullband-melgan test model size --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index 2b286b91..6e533eb9 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -22,6 +22,7 @@ config = FullbandMelganConfig( print_eval=True, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, 
output_path=output_path, ) config.audio.do_trim_silence = True From 87c61d210ab8409f30d635445e6c94f149d9abd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 10:16:38 +0200 Subject: [PATCH 165/258] update test to be less demanding --- tests/vocoder_tests/test_fullband_melgan_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index 6e533eb9..fbce03eb 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -22,7 +22,7 @@ config = FullbandMelganConfig( print_eval=True, discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", - discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, ) config.audio.do_trim_silence = True From 30211512a44bbb62a9fe9e074ef055414960bdc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 09:45:59 +0200 Subject: [PATCH 166/258] fix type annotations --- TTS/tts/utils/speakers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5c10c589..cebf0dca 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -241,7 +241,7 @@ class SpeakerManager: """ return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.Array: + def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean x_vector of a speaker ID. Args: @@ -250,7 +250,7 @@ class SpeakerManager: randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. Returns: - np.Array: Mean x_vector. + np.ndarray: Mean x_vector. """ x_vectors = self.get_x_vectors_by_speaker(speaker_idx) if num_samples is None: @@ -315,11 +315,11 @@ class SpeakerManager: x_vector = _compute(wav_file) return x_vector[0].tolist() - def compute_x_vector(self, feats: Union[torch.Tensor, np.Array]) -> List: + def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: """Compute x_vector from features. Args: - feats (Union[torch.Tensor, np.Array]): Input features. + feats (Union[torch.Tensor, np.ndarray]): Input features. Returns: List: computed x_vector. From b9a52dce9e1ff6bcf9a2a33d2d0b58b4db8491fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 10:48:13 +0200 Subject: [PATCH 167/258] add `test_all` to makefile --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 4dc2d588..70b7e34a 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,10 @@ dev-deps: ## install development deps deps: ## install 🐸 requirements. pip install -r requirements.txt +test_all: ## run tests and don't stop on an error. + nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id + ./run_bash_tests.sh + test: ## run tests. 
nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id ./run_bash_tests.sh From 8cdd423234317d1d3698f215da47cb1743e8f99e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 25 May 2021 14:41:13 +0200 Subject: [PATCH 168/258] styling formatting.py --- TTS/utils/arguments.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 9d92ae82..55bad4f2 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -29,16 +29,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), + help=("Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored."), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="") parser.add_argument( "--best_path", type=str, @@ -48,12 +48,23 @@ def init_arguments(argv): ), default="", ) + parser.add_argument("--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in argv) + parser.add_argument("--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.") parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", + type=str, + default="", + help="DISTRIBUTED: process group id.") return parser @@ -148,7 +159,8 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, + config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -169,7 +181,8 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", + 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From 1c8a3d7c86efed7e637f5f7e3b0b615c5427452c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 27 May 2021 17:25:00 +0200 Subject: [PATCH 169/258] make style --- TTS/tts/models/align_tts.py | 2 +- TTS/tts/models/speedy_speech.py | 2 +- TTS/tts/models/tacotron.py | 4 +--- TTS/utils/arguments.py | 39 +++++++++++---------------------- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 6efa64e2..3e8d4adc 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -275,7 +275,7 @@ class AlignTTS(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 96ef1740..455dbf38 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -183,7 +183,7 @@ class SpeedySpeech(nn.Module): g: [B, C] """ g = cond_input["x_vectors"] if "x_vectors" in cond_input else None - x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pylint: disable=not-callable + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 8d3124c3..12c3e5f9 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -191,11 +191,9 @@ class Tacotron(TacotronAbstract): mel_lengths: [B] cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] """ - cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} - input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) - # B x T_in x embed_dim inputs = self.embedding(text) + input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py index 55bad4f2..9d92ae82 100644 --- a/TTS/utils/arguments.py +++ b/TTS/utils/arguments.py @@ -29,16 +29,16 @@ def init_arguments(argv): parser.add_argument( "--continue_path", type=str, - help=("Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored."), + help=( + "Training output folder to continue training. Used to continue " + "a training. If it is used, 'config_path' is ignored." + ), default="", required="--config_path" not in argv, ) parser.add_argument( - "--restore_path", - type=str, - help="Model file to be restored. Use to finetune a model.", - default="") + "--restore_path", type=str, help="Model file to be restored. 
Use to finetune a model.", default="" + ) parser.add_argument( "--best_path", type=str, @@ -48,23 +48,12 @@ def init_arguments(argv): ), default="", ) - parser.add_argument("--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in argv) - parser.add_argument("--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.") parser.add_argument( - "--rank", - type=int, - default=0, - help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", - type=str, - default="", - help="DISTRIBUTED: process group id.") + "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv + ) + parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") + parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") + parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") return parser @@ -159,8 +148,7 @@ def process_args(args): print(" > Mixed precision mode is ON") experiment_path = args.continue_path if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, - config.run_name, args.debug) + experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) audio_path = os.path.join(experiment_path, "test_audios") # setup rank 0 process in distributed training tb_logger = None @@ -181,8 +169,7 @@ def process_args(args): os.chmod(experiment_path, 0o775) tb_logger = TensorboardLogger(experiment_path, model_name=config.model) # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", - 0) + tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) c_logger = ConsoleLogger() return config, experiment_path, audio_path, c_logger, tb_logger From 1fa15c195ae7e40bf92744c2bc3be0a015d694d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 14:35:15 +0200 Subject: [PATCH 170/258] docstring fix --- TTS/tts/models/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index bd1ad03e..68867ec8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -45,7 +45,7 @@ class Tacotron2(TacotronAbstract): speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. + gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. """ From 9c94b0c5c060699d57e286137b3556ab55dff840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 14:53:57 +0200 Subject: [PATCH 171/258] init `durations = None` --- TTS/trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TTS/trainer.py b/TTS/trainer.py index 34d73874..d81132cf 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -351,6 +351,7 @@ class TrainerTTS: speaker_ids = None # compute durations from attention masks + durations = None if attn_mask is not None: durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) for idx, am in enumerate(attn_mask): From f82f1970b8847185783dcf2a1a295260c2863616 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 1 Jun 2021 16:26:28 +0200 Subject: [PATCH 172/258] change `to(device)` to `type_as` in models --- TTS/tts/models/tacotron_abstract.py | 12 +++++------- TTS/tts/tf/utils/generic_utils.py | 3 +-- TTS/tts/utils/ssim.py | 9 +-------- TTS/vocoder/models/wavernn.py | 13 ++++++------- TTS/vocoder/utils/distribution.py | 4 +--- 5 files changed, 14 insertions(+), 27 deletions(-) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index 5e561066..fe43d81f 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -145,14 +145,13 @@ class TacotronAbstract(ABC, nn.Module): def compute_masks(self, text_lengths, mel_lengths): """Compute masks against sequence paddings.""" # B x T_in_max (boolean) - device = text_lengths.device - input_mask = sequence_mask(text_lengths).to(device) + input_mask = sequence_mask(text_lengths) output_mask = None if mel_lengths is not None: max_len = mel_lengths.max() r = self.decoder.r max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device) + output_mask = sequence_mask(mel_lengths, max_len=max_len) return input_mask, output_mask def _backward_pass(self, mel_specs, encoder_outputs, mask): @@ -195,20 +194,19 @@ class TacotronAbstract(ABC, nn.Module): def compute_gst(self, inputs, style_input, speaker_embedding=None): """Compute global style token""" - device = inputs.device if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).to(device) + query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) if speaker_embedding is not None: query = torch.cat([query, 
speaker_embedding.reshape(1, 1, -1)], dim=-1) _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) for k_token, v_amplifier in style_input.items(): key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) gst_outputs = gst_outputs + gst_outputs_att * v_amplifier elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) else: gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable inputs = self._concat_speaker_embedding(inputs, gst_outputs) diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py index 5b8b4ce2..e76893c2 100644 --- a/TTS/tts/tf/utils/generic_utils.py +++ b/TTS/tts/tf/utils/generic_utils.py @@ -44,8 +44,7 @@ def sequence_mask(sequence_length, max_len=None): batch_size = sequence_length.size(0) seq_range = np.empty([0, max_len], dtype=np.int8) seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - if sequence_length.is_cuda: - seq_range_expand = seq_range_expand.cuda() + seq_range_expand = seq_range_expand.type_as(sequence_length) seq_length_expand = sequence_length.unsqueeze(1).expand_as(seq_range_expand) # B x T_max return seq_range_expand < seq_length_expand diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 11107e47..caed575f 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -56,9 +56,6 @@ class SSIM(torch.nn.Module): window = self.window else: window = create_window(self.window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) window = window.type_as(img1) self.window = window @@ -69,10 +66,6 @@ class SSIM(torch.nn.Module): def ssim(img1, img2, window_size=11, size_average=True): (_, channel, _, _) = img1.size() - window = create_window(window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) + window = create_window(window_size, channel).type_as(img1) window = window.type_as(img1) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 994244dc..04040931 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -251,7 +251,6 @@ class WaveRNN(nn.Module): def inference(self, mels, batched=None, target=None, overlap=None): self.eval() - device = mels.device output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) @@ -259,7 +258,7 @@ class WaveRNN(nn.Module): with torch.no_grad(): if isinstance(mels, np.ndarray): - mels = torch.FloatTensor(mels).to(device) + mels = torch.FloatTensor(mels).type_as(mels) if mels.ndim == 2: mels = mels.unsqueeze(0) @@ -275,9 +274,9 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).to(device) - h2 = torch.zeros(b_size, self.rnn_dims).to(device) - x = torch.zeros(b_size, 1).to(device) + h1 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + h2 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + x = torch.zeros(b_size, 1).type_as(mels) if self.use_aux_net: d = self.aux_dims @@ -310,11 +309,11 @@ class WaveRNN(nn.Module): if self.mode == "mold": sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2)) 
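    # A minimal standalone sketch of the pattern this patch applies across the models
    # (not part of the WaveRNN loop above): new tensors are created and then cast with
    # `type_as`, so they follow the dtype and device of an existing tensor instead of
    # relying on an explicit `device` variable; `_ref` and `_new` are placeholder names.
    _ref = torch.rand(1, 80, 30)                 # stand-in reference tensor, may live on GPU in practice
    _new = torch.zeros(1, 512).type_as(_ref)     # replaces torch.zeros(1, 512).to(device)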
output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) + x = sample.transpose(0, 1).type_as(mels) elif self.mode == "gauss": sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) + x = sample.transpose(0, 1).type_as(mels) elif isinstance(self.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 5c2742c8..43d0d884 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -149,8 +149,6 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis - one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() - if tensor.is_cuda: - one_hot = one_hot.cuda() + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_().type_as(tensor) one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) return one_hot From 877bf66b61badc18807420e86e6ad9578ae605d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Jun 2021 10:17:58 +0200 Subject: [PATCH 173/258] reduce size of the metadata.csv used at testing --- tests/data/ljspeech/metadata.csv | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/data/ljspeech/metadata.csv b/tests/data/ljspeech/metadata.csv index 8f7832b5..6c65ca0d 100644 --- a/tests/data/ljspeech/metadata.csv +++ b/tests/data/ljspeech/metadata.csv @@ -6,27 +6,3 @@ LJ001-0005|the invention of movable metal letters in the middle of the fifteenth LJ001-0006|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography, LJ001-0007|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five, LJ001-0008|has never been surpassed.|has never been surpassed. -LJ001-0009|Printing, then, for our purpose, may be considered as the art of making books by means of movable types.|Printing, then, for our purpose, may be considered as the art of making books by means of movable types. -LJ001-0010|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress,|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress, -LJ001-0011|it is of the first importance that the letter used should be fine in form;|it is of the first importance that the letter used should be fine in form; -LJ001-0012|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters -LJ001-0013|than in the same operations with ugly ones.|than in the same operations with ugly ones. 
-LJ001-0014|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were,|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were, -LJ001-0015|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. -LJ001-0016|The Middle Ages brought calligraphy to perfection, and it was natural therefore|The Middle Ages brought calligraphy to perfection, and it was natural therefore -LJ001-0017|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely.|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely. -LJ001-0018|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character,|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character, -LJ001-0019|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters;|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters; -LJ001-0020|the "lower-case" being in fact invented in the early Middle Ages.|the "lower-case" being in fact invented in the early Middle Ages. -LJ001-0021|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation -LJ001-0022|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type,"|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type," -LJ001-0023|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century.|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century. -LJ001-0024|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year 1462)|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year fourteen sixty-two) -LJ001-0025|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read.|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read. 
-LJ001-0026|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type,|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type, -LJ001-0027|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer,|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer, -LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities. -LJ001-0029|But though on the whole, except in Italy, Gothic letter was most often used|But though on the whole, except in Italy, Gothic letter was most often used -LJ001-0030|a very few years saw the birth of Roman character not only in Italy, but in Germany and France.|a very few years saw the birth of Roman character not only in Italy, but in Germany and France. -LJ001-0031|In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,|In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome, -LJ001-0032|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman,|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman, \ No newline at end of file From 9042ae919508e6ffff186daa540966745449f71e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 15:05:39 +0200 Subject: [PATCH 174/258] use `to_cuda()` for moving data in `format_batch()` --- TTS/trainer.py | 22 +++++++++++----------- TTS/tts/datasets/TTSDataset.py | 2 +- TTS/utils/generic_utils.py | 9 +++++++++ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index d81132cf..8ec59f55 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -27,7 +27,7 @@ from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.training import check_update, setup_torch_training_env @@ -377,18 +377,18 @@ class TrainerTTS: # dispatch batch to GPU if self.use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if self.config.model.lower() in ["tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - attn_mask = attn_mask.cuda(non_blocking=True) if attn_mask is not None else None - durations = durations.cuda(non_blocking=True) if attn_mask is not None else None + text_input = to_cuda(text_input) + text_lengths = to_cuda(text_lengths) + mel_input = to_cuda(mel_input) + mel_lengths = to_cuda(mel_lengths) + linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None + stop_targets = to_cuda(stop_targets) + attn_mask = to_cuda(attn_mask) if attn_mask is not None else None + durations = to_cuda(durations) if attn_mask is not None else None if speaker_ids is not None: 
- speaker_ids = speaker_ids.cuda(non_blocking=True) + speaker_ids = to_cuda(speaker_ids) if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) + speaker_embeddings = to_cuda(speaker_embeddings) return { "text_input": text_input, diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index cbb0a593..76f82c97 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -282,7 +282,7 @@ class TTSDataset(Dataset): """ # Puts each data field into a tensor with outer dimension batch size - if isinstance(batch[0], collections.Mapping): + if isinstance(batch[0], collections.abc.Mapping): text_lenghts = np.array([len(d["text"]) for d in batch]) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 0c28116d..a1abf5fe 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -13,6 +13,15 @@ from typing import Dict import torch +def to_cuda(x: torch.Tensor) -> torch.Tensor: + if x is None: + return None + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) + return x + + def get_cuda(): use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") From db6a97d1a26a99bd4107d6d941fb5c166d5d69ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 11:42:40 +0200 Subject: [PATCH 175/258] rename external speaker embedding arguments as `d_vectors` --- TTS/bin/extract_tts_spectrograms.py | 23 ++-- TTS/bin/synthesize.py | 2 +- TTS/trainer.py | 26 ++-- TTS/tts/layers/tacotron/gst_layers.py | 10 +- TTS/tts/layers/tacotron/tacotron.py | 2 +- TTS/tts/models/__init__.py | 8 +- TTS/tts/models/align_tts.py | 12 +- TTS/tts/models/glow_tts.py | 42 +++---- TTS/tts/models/speedy_speech.py | 12 +- TTS/tts/models/tacotron.py | 44 +++---- TTS/tts/models/tacotron2.py | 42 +++---- TTS/tts/models/tacotron_abstract.py | 40 +++---- TTS/tts/utils/speakers.py | 118 +++++++++---------- TTS/tts/utils/synthesis.py | 24 ++-- TTS/utils/synthesizer.py | 18 +-- tests/test_extract_tts_spectrograms.py | 6 +- tests/test_speaker_manager.py | 54 ++++----- tests/tts_tests/test_speedy_speech_layers.py | 4 +- tests/tts_tests/test_tacotron2_model.py | 8 +- tests/tts_tests/test_tacotron_model.py | 8 +- 20 files changed, 251 insertions(+), 252 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 016b389f..3acf5d02 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -108,9 +108,8 @@ def format_data(data): mel_lengths = mel_lengths.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - + if d_vectors is not None: + d_vectors = d_vectors.cuda(non_blocking=True) if attn_mask is not None: attn_mask = attn_mask.cuda(non_blocking=True) return ( @@ -119,7 +118,7 @@ def format_data(data): mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, avg_text_length, avg_spec_length, attn_mask, @@ -137,23 +136,23 @@ def inference( mel_input, mel_lengths, speaker_ids=None, - speaker_embeddings=None, + d_vectors=None, ): if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids - elif speaker_embeddings is not None: - speaker_c = speaker_embeddings + elif d_vectors is not None: + speaker_c = d_vectors outputs = 
model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c} + text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c} ) model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) postnet_outputs = outputs["model_outputs"] # normalize tacotron output @@ -184,7 +183,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, _, _, _, @@ -200,7 +199,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, ) for idx in range(text_input.shape[0]): @@ -256,7 +255,7 @@ def main(args): # pylint: disable=redefined-outer-name speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim) + model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index a5066e3d..3cde5612 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -157,7 +157,7 @@ def main(): parser.add_argument( "--speaker_wav", nargs="+", - help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. 
The d_vectors is computed as their average.", default=None, ) parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None) diff --git a/TTS/trainer.py b/TTS/trainer.py index 8ec59f55..55560624 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -113,7 +113,7 @@ class TrainerTTS: len(self.model_characters), self.speaker_manager.num_speakers, self.config, - self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None, + self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, ) # setup criterion @@ -156,8 +156,8 @@ class TrainerTTS: print("\n > Model has {} parameters".format(num_params)) @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, x_vector_dim) + def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, d_vector_dim) return model @staticmethod @@ -196,11 +196,11 @@ class TrainerTTS: speakers_file = config.external_speaker_embedding_file if config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) + speaker_manager.load_d_vectors_file(speakers_file) else: speaker_manager.load_ids_file(speakers_file) elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file) + speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file) else: speaker_manager.parse_speakers_from_items(data_train) file_path = os.path.join(out_path, "speakers.json") @@ -387,8 +387,8 @@ class TrainerTTS: durations = to_cuda(durations) if attn_mask is not None else None if speaker_ids is not None: speaker_ids = to_cuda(speaker_ids) - if speaker_embeddings is not None: - speaker_embeddings = to_cuda(speaker_embeddings) + if d_vectors is not None: + d_vectors = to_cuda(d_vectors) return { "text_input": text_input, @@ -400,7 +400,7 @@ class TrainerTTS: "attn_mask": attn_mask, "durations": durations, "speaker_ids": speaker_ids, - "x_vectors": speaker_embeddings, + "d_vectors": d_vectors, "max_text_length": max_text_length, "max_spec_length": max_spec_length, "item_idx": item_idx, @@ -591,7 +591,7 @@ class TrainerTTS: self.use_cuda, self.ap, speaker_id=cond_inputs["speaker_id"], - x_vector=cond_inputs["x_vector"], + d_vector=cond_inputs["d_vector"], style_wav=cond_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, @@ -612,9 +612,9 @@ class TrainerTTS: def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None - # setup x_vector - x_vector = ( - self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None ) @@ -629,7 +629,7 @@ class TrainerTTS: print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector} + cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} return cond_inputs def fit(self) -> None: diff --git 
a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index e2784e5d..02154093 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -8,10 +8,10 @@ class GST(nn.Module): See https://arxiv.org/pdf/1803.09017""" - def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None): super().__init__() self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim) - self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim) + self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim) def forward(self, inputs, speaker_embedding=None): enc_out = self.encoder(inputs) @@ -83,13 +83,13 @@ class ReferenceEncoder(nn.Module): class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" - def __init__(self, num_heads, num_style_tokens, embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None): super().__init__() self.query_dim = embedding_dim // 2 - if speaker_embedding_dim: - self.query_dim += speaker_embedding_dim + if d_vector_dim: + self.query_dim += d_vector_dim self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim)) diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index dc38173f..2f94db88 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -266,7 +266,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. - speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. + d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. 
""" # Pylint gets confused by PyTorch conventions here diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 153f8d43..026f5c85 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,7 +1,7 @@ from TTS.utils.generic_utils import find_module -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): +def setup_model(num_chars, num_speakers, c, d_vector_dim=None): print(" > Using model: {}".format(c.model)) MyModel = find_module("TTS.tts.models", c.model.lower()) if c.model.lower() in "tacotron": @@ -29,7 +29,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "tacotron2": model = MyModel( @@ -55,7 +55,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "glow_tts": model = MyModel( @@ -79,7 +79,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): num_squeeze=2, sigmoid_scale=False, mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "speedy_speech": model = MyModel( diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 3e8d4adc..20b0cdf7 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -212,7 +212,7 @@ class AlignTTS(nn.Module): return dr_mas, mu, log_sigma, logp def forward( - self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None + self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None ): # pylint: disable=unused-argument """ Shapes: @@ -223,7 +223,7 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -267,14 +267,14 @@ class AlignTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -293,10 +293,10 @@ class AlignTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) loss_dict = criterion( outputs["logp"], diff --git a/TTS/tts/models/glow_tts.py 
b/TTS/tts/models/glow_tts.py index af52ba1c..9c928a67 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -36,7 +36,7 @@ class GlowTTS(nn.Module): mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. encoder_type (str): encoder module type. encoder_params (dict): encoder module parameters. - speaker_embedding_dim (int): channels of external speaker embedding vectors. + d_vector_dim (int): channels of external speaker embedding vectors. """ def __init__( @@ -62,7 +62,7 @@ class GlowTTS(nn.Module): mean_only=False, encoder_type="transformer", encoder_params=None, - speaker_embedding_dim=None, + d_vector_dim=None, ): super().__init__() @@ -88,15 +88,15 @@ class GlowTTS(nn.Module): # model constants. self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim # if is a multispeaker and c_in_channels is 0, set to 256 if num_speakers > 1: - if self.c_in_channels == 0 and not self.speaker_embedding_dim: + if self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.speaker_embedding_dim: - self.c_in_channels = self.speaker_embedding_dim + elif self.d_vector_dim: + self.c_in_channels = self.d_vector_dim self.encoder = Encoder( num_chars, @@ -125,7 +125,7 @@ class GlowTTS(nn.Module): c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not speaker_embedding_dim: + if num_speakers > 1 and not d_vector_dim: # speaker embedding layer self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @@ -144,7 +144,7 @@ class GlowTTS(nn.Module): return y_mean, y_log_scale, o_attn_dur def forward( - self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -157,9 +157,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -197,7 +197,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def inference_with_MAS( - self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
@@ -212,9 +212,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -258,7 +258,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def decoder_inference( - self, y, y_lengths=None, cond_input={"x_vectors": None} + self, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -268,10 +268,10 @@ class GlowTTS(nn.Module): """ y = y.transpose(1, 2) y_max_length = y.size(2) - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None # norm speaker embeddings if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -290,10 +290,10 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] @@ -338,9 +338,9 @@ class GlowTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors}) loss_dict = criterion( outputs["model_outputs"], diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 455dbf38..53f7bbaa 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward( - self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None} + self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids @@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, x, 
cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 @@ -204,11 +204,11 @@ class SpeedySpeech(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] durations = batch["durations"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 12c3e5f9..123b69a7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -42,7 +42,7 @@ class Tacotron(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. memory_size (int, optional): size of the history queue fed to the prenet. 
Model feeds the last ```memory_size``` @@ -75,7 +75,7 @@ class Tacotron(TacotronAbstract): ddc_r=None, encoder_in_features=256, decoder_in_features=256, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, memory_size=5, @@ -104,7 +104,7 @@ class Tacotron(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -112,14 +112,14 @@ class Tacotron(TacotronAbstract): # speaker embedding layers if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 256 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) @@ -154,7 +154,7 @@ class Tacotron(TacotronAbstract): if self.gst and self.use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ outputs = {"alignments_backward": None, "decoder_outputs_backward": None} inputs = self.embedding(text) @@ -201,16 +201,16 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) # speaker embedding if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -254,15 +254,15 @@ class Tacotron(TacotronAbstract): encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x 
speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) @@ -289,7 +289,7 @@ class Tacotron(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -297,7 +297,7 @@ class Tacotron(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -308,7 +308,7 @@ class Tacotron(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 68867ec8..4628c64e 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -42,7 +42,7 @@ class Tacotron2(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. 
@@ -73,7 +73,7 @@ class Tacotron2(TacotronAbstract): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -101,7 +101,7 @@ class Tacotron2(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -109,14 +109,14 @@ class Tacotron2(TacotronAbstract): # speaker embedding layer if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 512 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) @@ -142,13 +142,13 @@ class Tacotron2(TacotronAbstract): self.postnet = Postnet(self.postnet_output_dim) # setup prenet dropout self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference # global style token layers if self.gst and use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron2(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} @@ -202,15 +202,15 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -255,15 +255,15 @@ class Tacotron2(TacotronAbstract): if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None]
x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input["x_vectors"] + embedded_speakers = cond_input["d_vectors"] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -291,7 +291,7 @@ class Tacotron2(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -299,7 +299,7 @@ class Tacotron2(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -310,7 +310,7 @@ class Tacotron2(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index fe43d81f..e480e2e0 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -35,7 +35,7 @@ class TacotronAbstract(ABC, nn.Module): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -66,7 +66,7 @@ class TacotronAbstract(ABC, nn.Module): self.separate_stopnet = separate_stopnet self.encoder_in_features = encoder_in_features self.decoder_in_features = decoder_in_features - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim self.gradual_training = gradual_training # layers @@ -76,12 +76,12 @@ class TacotronAbstract(ABC, nn.Module): self.postnet = None # multispeaker - if self.speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False + if self.d_vector_dim is None: + # if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim + self.use_d_vectors = False else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True + # if d_vector_dim is not None we need use speaker embedding per sample + self.use_d_vectors = True # global style token if self.gst and use_gst: @@ -89,8 +89,8 @@ class TacotronAbstract(ABC, nn.Module): self.gst_layer = None # model states - self.speaker_embeddings = None - self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None # additional layers self.decoder_backward = None @@ -98,15 +98,15 @@ class TacotronAbstract(ABC, nn.Module): @staticmethod def _format_cond_input(cond_input: Dict) -> Dict: - return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input) + return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input) ############################# # INIT FUNCTIONS ############################# def _init_states(self): - self.speaker_embeddings = None - 
self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None def _init_backward_decoder(self): self.decoder_backward = copy.deepcopy(self.decoder) @@ -188,9 +188,9 @@ class TacotronAbstract(ABC, nn.Module): if hasattr(self, "speaker_embedding") and speaker_ids is None: raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided") if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) def compute_gst(self, inputs, style_input, speaker_embedding=None): """Compute global style token""" @@ -213,15 +213,15 @@ class TacotronAbstract(ABC, nn.Module): return inputs @staticmethod - def _add_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + speaker_embeddings_ + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ return outputs @staticmethod - def _concat_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) return outputs ############################# diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index cebf0dca..546d483d 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -52,8 +52,8 @@ def get_speaker_manager(c, args, meta_data_train): raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" ) - speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) - speaker_manager.set_x_vectors_from_file(speakers_file) + speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. speakers_file = os.path.dirname(args.restore_path) speaker_ids_from_data = speaker_manager.speaker_ids @@ -63,7 +63,7 @@ def get_speaker_manager(c, args, meta_data_train): ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file) elif ( c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file ): # new speaker manager with speaker IDs file. @@ -88,7 +88,7 @@ class SpeakerManager: { 'clip_name.wav':{ 'name': 'speakerA', - 'embedding'[] + 'embedding'[] }, ... 
} @@ -103,10 +103,10 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> x_vector = manager.compute_x_vector(mel.T) + >>> d_vector = manager.compute_d_vector(mel.T) Args: - x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". + d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". @@ -116,15 +116,15 @@ class SpeakerManager: def __init__( self, data_items: List[List[Any]] = None, - x_vectors_file_path: str = "", + d_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", ): self.data_items = [] - self.x_vectors = {} - self.speaker_ids = [] + self.d_vectors = {} + self.speaker_ids = {} self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None @@ -132,8 +132,8 @@ class SpeakerManager: if data_items: self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) - if x_vectors_file_path: - self.set_x_vectors_from_file(x_vectors_file_path) + if d_vectors_file_path: + self.set_d_vectors_from_file(d_vectors_file_path) if speaker_id_file_path: self.set_speaker_ids_from_file(speaker_id_file_path) @@ -156,10 +156,10 @@ class SpeakerManager: return len(self.speaker_ids) @property - def x_vector_dim(self): - """Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" - if self.x_vectors: - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + def d_vector_dim(self): + """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" + if self.d_vectors: + return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) return 0 @staticmethod @@ -201,73 +201,73 @@ class SpeakerManager: """ self._save_json(file_path, self.speaker_ids) - def save_x_vectors_to_file(self, file_path: str) -> None: - """Save x_vectors to a json file. + def save_d_vectors_to_file(self, file_path: str) -> None: + """Save d_vectors to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.x_vectors) + self._save_json(file_path, self.d_vectors) - def set_x_vectors_from_file(self, file_path: str) -> None: - """Load x_vectors from a json file. + def set_d_vectors_from_file(self, file_path: str) -> None: + """Load d_vectors from a json file. Args: file_path (str): Path to the target json file. """ - self.x_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) - self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) + self.d_vectors = self._load_json(file_path) + self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values()))) + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - def get_x_vector_by_clip(self, clip_idx: str) -> List: - """Get x_vector by clip ID. + def get_d_vector_by_clip(self, clip_idx: str) -> List: + """Get d_vector by clip ID. Args: clip_idx (str): Target clip ID. Returns: - List: x_vector as a list. + List: d_vector as a list. 
""" - return self.x_vectors[clip_idx]["embedding"] + return self.d_vectors[clip_idx]["embedding"] - def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all x_vectors of a speaker. + def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all d_vectors of a speaker. Args: speaker_idx (str): Target speaker ID. Returns: - List[List]: all the x_vectors of the given speaker. + List[List]: all the d_vectors of the given speaker. """ - return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] + return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean x_vector of a speaker ID. + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean d_vector of a speaker ID. Args: speaker_idx (str): Target speaker ID. num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. + randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False. Returns: - np.ndarray: Mean x_vector. + np.ndarray: Mean d_vector. """ - x_vectors = self.get_x_vectors_by_speaker(speaker_idx) + d_vectors = self.get_d_vectors_by_speaker(speaker_idx) if num_samples is None: - x_vectors = np.stack(x_vectors).mean(0) + d_vectors = np.stack(d_vectors).mean(0) else: - assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" + assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" if randomize: - x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) + d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) else: - x_vectors = np.stack(x_vectors[:num_samples]).mean(0) - return x_vectors + d_vectors = np.stack(d_vectors[:num_samples]).mean(0) + return d_vectors def get_speakers(self) -> List: return self.speaker_ids def get_clips(self) -> List: - return sorted(self.x_vectors.keys()) + return sorted(self.d_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: """Initialize a speaker encoder model. @@ -284,14 +284,14 @@ class SpeakerManager: self.speaker_encoder_ap.do_sound_norm = True self.speaker_encoder_ap.do_trim_silence = True - def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: - """Compute a x_vector from a given audio file. + def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a d_vector from a given audio file. Args: wav_file (Union[str, list]): Target file path. Returns: - list: Computed x_vector. + list: Computed d_vector. 
""" def _compute(wav_file: str): @@ -299,30 +299,30 @@ class SpeakerManager: spec = self.speaker_encoder_ap.melspectrogram(waveform) spec = torch.from_numpy(spec.T) spec = spec.unsqueeze(0) - x_vector = self.speaker_encoder.compute_embedding(spec) - return x_vector + d_vector = self.speaker_encoder.compute_embedding(spec) + return d_vector if isinstance(wav_file, list): - # compute the mean x_vector - x_vectors = None + # compute the mean d_vector + d_vectors = None for wf in wav_file: - x_vector = _compute(wf) - if x_vectors is None: - x_vectors = x_vector + d_vector = _compute(wf) + if d_vectors is None: + d_vectors = d_vector else: - x_vectors += x_vector - return (x_vectors / len(wav_file))[0].tolist() - x_vector = _compute(wav_file) - return x_vector[0].tolist() + d_vectors += d_vector + return (d_vectors / len(wav_file))[0].tolist() + d_vector = _compute(wav_file) + return d_vector[0].tolist() - def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute x_vector from features. + def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute d_vector from features. Args: feats (Union[torch.Tensor, np.ndarray]): Input features. Returns: - List: computed x_vector. + List: computed d_vector. """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 7328ddae..04fef715 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -65,9 +65,9 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): +def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None): outputs = model.inference( - inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} + inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel} ) return outputs @@ -140,13 +140,13 @@ def speaker_id_to_torch(speaker_id, cuda=False): return speaker_id -def embedding_to_torch(x_vector, cuda=False): - if x_vector is not None: - x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(d_vector, cuda=False): + if d_vector is not None: + d_vector = np.asarray(d_vector) + d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: - return x_vector.cuda() - return x_vector + return d_vector.cuda() + return d_vector # TODO: perform GL with pytorch for batching @@ -178,7 +178,7 @@ def synthesis( enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - x_vector=None, + d_vector=None, backend="torch", ): """Synthesize voice for the given text. 
@@ -210,8 +210,8 @@ def synthesis( if speaker_id is not None: speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if x_vector is not None: - x_vector = embedding_to_torch(x_vector, cuda=use_cuda) + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) @@ -228,7 +228,7 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a31436d4..8f510f20 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -63,7 +63,7 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} - self.speaker_embedding_dim = 0 + self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -98,9 +98,9 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers - self.speaker_embedding_dim = self.speaker_manager.x_vector_dim + self.d_vector_dim = self.speaker_manager.d_vector_dim def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. @@ -135,7 +135,7 @@ class Synthesizer(object): self.input_size, num_speakers=self.num_speakers, c=self.tts_config, - speaker_embedding_dim=self.speaker_embedding_dim, + d_vector_dim=self.d_vector_dim, ) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: @@ -197,9 +197,9 @@ class Synthesizer(object): print(sens) if self.tts_speakers_file: - # get the speaker embedding from the saved x_vectors. + # get the speaker embedding from the saved d_vectors. if speaker_idx and isinstance(speaker_idx, str): - speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " @@ -214,9 +214,9 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) - # compute a new x_vector from the given clip. + # compute a new d_vector from the given clip. 
if speaker_wav is not None: - speaker_embedding = self.speaker_manager.compute_x_vector_from_clip(speaker_wav) + speaker_embedding = self.speaker_manager.compute_d_vector_from_clip(speaker_wav) use_gl = self.vocoder_model is None @@ -232,7 +232,7 @@ class Synthesizer(object): style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - x_vector=speaker_embedding, + d_vector=speaker_embedding, ) waveform = outputs["wav"] mel_postnet_spec = outputs["model_outputs"] diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index ddc7e4da..d16167ed 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -22,7 +22,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -41,7 +41,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -60,7 +60,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index f80e56fc..a695fe61 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -15,11 +15,11 @@ encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") -x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") +d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") class SpeakerManagerTest(unittest.TestCase): - """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + """Test SpeakerManager for loading embedding files and computing d_vectors from waveforms""" @staticmethod def test_speaker_embedding(): @@ -38,38 +38,38 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - x_vector = manager.compute_x_vector(mel.T) - assert x_vector.shape[1] == 256 + d_vector = manager.compute_d_vector(mel.T) + assert d_vector.shape[1] == 256 - # compute x_vector directly from an input file - x_vector = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector = torch.FloatTensor(x_vector) - x_vector2 = torch.FloatTensor(x_vector2) - assert x_vector.shape[0] == 256 - assert (x_vector - 
x_vector2).sum() == 0.0 + # compute d_vector directly from an input file + d_vector = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = torch.FloatTensor(d_vector) + d_vector2 = torch.FloatTensor(d_vector2) + assert d_vector.shape[0] == 256 + assert (d_vector - d_vector2).sum() == 0.0 - # compute x_vector from a list of wav files. - x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2]) - x_vector3 = torch.FloatTensor(x_vector3) - assert x_vector3.shape[0] == 256 - assert (x_vector - x_vector3).sum() != 0.0 + # compute d_vector from a list of wav files. + d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = torch.FloatTensor(d_vector3) + assert d_vector3.shape[0] == 256 + assert (d_vector - d_vector3).sum() != 0.0 # remove dummy model os.remove(encoder_model_path) @staticmethod def test_speakers_file_processing(): - manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) + manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.x_vector_dim) + print(manager.d_vector_dim) print(manager.clip_ids) - x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) - assert len(x_vector) == 256 - x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) - assert len(x_vectors[0]) == 256 - x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) - assert len(x_vector1) == 256 - x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) - assert len(x_vector2) == 256 - assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0 + d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + assert len(d_vector) == 256 + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0]) + assert len(d_vectors[0]) == 256 + d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + assert len(d_vector1) == 256 + d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + assert len(d_vector2) == 256 + assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 66339a82..7c4f0adf 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -57,7 +57,7 @@ def test_speedy_speech(): # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model.forward( - x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} + x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) o_de = outputs["model_outputs"] attn = outputs["alignments"] @@ -71,7 +71,7 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) + model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] o_dr = outputs["durations_log"] diff --git a/tests/tts_tests/test_tacotron2_model.py 
b/tests/tts_tests/test_tacotron2_model.py index 0933ec70..b77f7cc5 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -95,7 +95,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 @@ -105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -259,7 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, use_gst=True, gst=c.gst).to( + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to( device ) model.train() @@ -271,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 86de5d16..31682d7a 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -116,7 +116,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): decoder_output_dim=c.audio["num_mels"], r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -305,7 +305,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): gst=c.gst, r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, 
mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) From 802d4613890917f400b70ca63b4f9e66560063a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 11:46:53 +0200 Subject: [PATCH 176/258] Compute d_vectors and speaker_ids separately in TTSDataset --- TTS/bin/extract_tts_spectrograms.py | 19 ++++---------- TTS/trainer.py | 40 +++++++++++------------------ TTS/tts/datasets/TTSDataset.py | 35 +++++++++++++++++-------- 3 files changed, 44 insertions(+), 50 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 3acf5d02..d17bcb30 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -39,7 +39,8 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_mapping=speaker_manager.speaker_ids + speaker_id_mapping=speaker_manager.speaker_ids, + d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None, ) @@ -84,22 +85,12 @@ def format_data(data): mel_input = data[4] mel_lengths = data[5] item_idx = data[7] - attn_mask = data[9] + d_vectors = data[8] + speaker_ids = data[9] + attn_mask = data[10] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) - if c.use_speaker_embedding: - if c.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) diff --git a/TTS/trainer.py b/TTS/trainer.py index 55560624..7136e023 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -267,7 +267,8 @@ class TrainerTTS: is_eval: bool, data_items: List, verbose: bool, - speaker_mapping: Union[Dict, List], + speaker_ids: Union[Dict, List], + d_vectors: Union[Dict, List] ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None @@ -289,9 +290,10 @@ class TrainerTTS: enable_eos_bos=self.config.enable_eos_bos_chars, use_noise_augment=not is_eval, verbose=verbose, - speaker_mapping=speaker_mapping - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, + speaker_id_mapping=speaker_ids + if self.config.use_speaker_embedding else None, + d_vector_mapping=d_vectors + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -313,14 +315,14 @@ class TrainerTTS: return loader def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_mapping) + return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_mapping: Union[List, Dict] + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: 
Dict ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_mapping) + return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) def format_batch(self, batch: List) -> Dict: # setup input batch @@ -332,24 +334,12 @@ class TrainerTTS: mel_lengths = batch[5] stop_targets = batch[6] item_idx = batch[7] - speaker_embeddings = batch[8] - attn_mask = batch[9] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] max_text_length = torch.max(text_lengths.float()) max_spec_length = torch.max(mel_lengths.float()) - # convert speaker names to ids - if self.config.use_speaker_embedding: - if self.config.use_external_speaker_embedding_file: - speaker_embeddings = batch[8] - speaker_ids = None - else: - speaker_ids = [self.speaker_manager.speaker_ids[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - # compute durations from attention masks durations = None if attn_mask is not None: @@ -640,11 +630,11 @@ class TrainerTTS: # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors ) self.eval_loader = ( self.get_eval_dataloder( - self.config.r, self.ap, self.data_train, verbose=True, speaker_mapping=self.speaker_manager.speaker_ids + self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors ) if self.config.run_eval else None diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 76f82c97..2522b55a 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -29,7 +29,8 @@ class TTSDataset(Dataset): phoneme_cache_path=None, phoneme_language="en-us", enable_eos_bos=False, - speaker_mapping=None, + speaker_id_mapping=None, + d_vector_mapping=None, use_noise_augment=False, verbose=False, ): @@ -51,6 +52,8 @@ class TTSDataset(Dataset): phoneme_language (str): one the languages from https://github.com/bootphon/phonemizer#languages enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. + speaker_id_mapping (dict): list of speaker ids to map speaker names to numerical ids. + d_vector_mapping (dict): dictionary of d-vectors that maps each audio file to a pre-computed d-vector. use_noise_augment (bool): enable adding random noise to wav for augmentation. verbose (bool): print diagnostic information. 
""" @@ -70,7 +73,8 @@ class TTSDataset(Dataset): self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos - self.speaker_mapping = speaker_mapping + self.speaker_id_mapping = speaker_id_mapping + self.d_vector_mapping = d_vector_mapping self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False @@ -293,13 +297,18 @@ class TTSDataset(Dataset): item_idxs = [batch[idx]["item_idx"] for idx in ids_sorted_decreasing] text = [batch[idx]["text"] for idx in ids_sorted_decreasing] - speaker_name = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] - # get speaker embeddings - if self.speaker_mapping is not None: + speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] + # get pre-computed d-vectors + if self.d_vector_mapping is not None: wav_files_names = [batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing] - speaker_embedding = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] + d_vectors = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] else: - speaker_embedding = None + d_vectors = None + # get numerical speaker ids from speaker names + if self.speaker_id_mapping: + speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in speaker_names] + else: + speaker_ids = None # compute features mel = [self.ap.melspectrogram(w).astype("float32") for w in wav] @@ -327,8 +336,11 @@ class TTSDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - if speaker_embedding is not None: - speaker_embedding = torch.FloatTensor(speaker_embedding) + if d_vectors is not None: + d_vectors = torch.FloatTensor(d_vectors) + + if speaker_ids is not None: + speaker_ids = torch.LongTensor(speaker_ids) # compute linear spectrogram if self.compute_linear_spec: @@ -355,13 +367,14 @@ class TTSDataset(Dataset): return ( text, text_lenghts, - speaker_name, + speaker_names, linear, mel, mel_lengths, stop_targets, item_idxs, - speaker_embedding, + d_vectors, + speaker_ids, attns, ) From d6b2b6add63fcef286806d30a32f0c3229c7a000 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 3 Jun 2021 13:05:54 +0200 Subject: [PATCH 177/258] make style and linter fixes --- TTS/bin/extract_tts_spectrograms.py | 1 - TTS/trainer.py | 23 +++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index d17bcb30..24665871 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -81,7 +81,6 @@ def format_data(data): # setup input data text_input = data[0] text_lengths = data[1] - speaker_names = data[2] mel_input = data[4] mel_lengths = data[5] item_idx = data[7] diff --git a/TTS/trainer.py b/TTS/trainer.py index 7136e023..f837ce7f 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -268,7 +268,7 @@ class TrainerTTS: data_items: List, verbose: bool, speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List] + d_vectors: Union[Dict, List], ) -> DataLoader: if is_eval and not self.config.run_eval: loader = None @@ -290,10 +290,10 @@ class TrainerTTS: enable_eos_bos=self.config.enable_eos_bos_chars, use_noise_augment=not is_eval, verbose=verbose, - speaker_id_mapping=speaker_ids - if self.config.use_speaker_embedding else None, + speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, d_vector_mapping=d_vectors - if 
self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file else None, + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, ) if self.config.use_phonemes and self.config.compute_input_seq_cache: @@ -383,6 +383,7 @@ class TrainerTTS: return { "text_input": text_input, "text_lengths": text_lengths, + "speaker_names": speaker_names, "mel_input": mel_input, "mel_lengths": mel_lengths, "linear_input": linear_input, @@ -630,11 +631,21 @@ class TrainerTTS: # define data loaders self.train_loader = self.get_train_dataloader( - self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, ) self.eval_loader = ( self.get_eval_dataloder( - self.config.r, self.ap, self.data_train, verbose=True, speaker_ids=self.speaker_manager.speaker_ids, d_vectors=self.speaker_manager.d_vectors + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, ) if self.config.run_eval else None From 2c38ef8441d5162cc6eb76d94625386fb5543bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 5 Jun 2021 11:46:53 +0200 Subject: [PATCH 178/258] use get_speaker_manager in Trainer and save speakers.json file when needed --- TTS/trainer.py | 22 ++-------------------- TTS/tts/utils/speakers.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index f837ce7f..564c4c26 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -21,7 +21,7 @@ from TTS.tts.datasets import TTSDataset, load_meta_data from TTS.tts.layers import setup_loss from TTS.tts.models import setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -186,25 +186,7 @@ class TrainerTTS: def get_speaker_manager( config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None ) -> SpeakerManager: - speaker_manager = SpeakerManager() - if restore_path: - speakers_file = os.path.join(os.path.dirname(restore_path), "speaker.json") - if not os.path.exists(speakers_file): - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speakers_file = config.external_speaker_embedding_file - - if config.use_external_speaker_embedding_file: - speaker_manager.load_d_vectors_file(speakers_file) - else: - speaker_manager.load_ids_file(speakers_file) - elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file) - else: - speaker_manager.parse_speakers_from_items(data_train) - file_path = os.path.join(out_path, "speakers.json") - speaker_manager.save_ids_file(file_path) + speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) return speaker_manager @staticmethod diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 546d483d..0f43bf97 100755 --- 
a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -34,16 +34,16 @@ def save_speaker_mapping(out_path, speaker_mapping): json.dump(speaker_mapping, f, indent=4) -def get_speaker_manager(c, args, meta_data_train): +def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): """Inititalize and return a `SpeakerManager` based on config values""" speaker_manager = SpeakerManager() if c.use_speaker_embedding: speaker_manager.set_speaker_ids_from_data(meta_data_train) - if args.restore_path: + if restore_path: # restoring speaker manager from a previous run. if c.use_external_speaker_embedding_file: # restore speaker manager with the embedding file - speakers_file = os.path.dirname(args.restore_path) + speakers_file = os.path.dirname(restore_path) if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" @@ -55,7 +55,7 @@ def get_speaker_manager(c, args, meta_data_train): speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. - speakers_file = os.path.dirname(args.restore_path) + speakers_file = os.path.dirname(restore_path) speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) assert all( @@ -73,6 +73,14 @@ def get_speaker_manager(c, args, meta_data_train): speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) ) ) + # save file if path is defined + if out_path: + out_file_path = os.path.join(out_path, "speaker.json") + print(" > Saving `speaker.json` to {out_file_path}.") + if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: + speaker_manager.save_d_vectors_to_file(out_file_path) + else: + speaker_manager.save_speaker_ids_to_file(out_file_path) return speaker_manager From 304d60197b3bdd94001bfa0fb9162c769440f392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 5 Jun 2021 11:48:16 +0200 Subject: [PATCH 179/258] reduce multiband melgan test model size --- tests/vocoder_tests/test_fullband_melgan_train.py | 1 - tests/vocoder_tests/test_multiband_melgan_train.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index fbce03eb..f93a5318 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -20,7 +20,6 @@ config = FullbandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index ef362414..5c6a0fc8 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -22,6 +22,7 @@ config = MultibandMelganConfig( print_eval=True, discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", + discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, 
output_path=output_path, ) config.audio.do_trim_silence = True From b3324bd9145d605261c86c09e37d51e70b37633a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:21:13 +0200 Subject: [PATCH 180/258] fix speaker_manager init --- TTS/trainer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 564c4c26..9fe2f108 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -101,9 +101,7 @@ class TrainerTTS: self.data_train, self.data_eval = load_meta_data(self.config.datasets) # default speaker manager - self.speaker_manager = self.get_speaker_manager( - self.config, args.restore_path, self.config.output_path, self.data_train - ) + self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) # init TTS model if model is not None: @@ -587,7 +585,7 @@ class TrainerTTS: speaker_id = 0 if self.config.use_speaker_embedding else None # setup d_vector d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None ) From 269e5a734ee04b7274b4ad6aaec52b3c5ad6679d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:38:01 +0200 Subject: [PATCH 181/258] add max_decoder_steps argument to tacotron models --- TTS/tts/configs/tacotron_config.py | 3 +++ TTS/tts/layers/tacotron/tacotron.py | 4 +++- TTS/tts/layers/tacotron/tacotron2.py | 4 +++- TTS/tts/models/__init__.py | 2 ++ TTS/tts/models/tacotron.py | 4 ++++ TTS/tts/models/tacotron2.py | 4 ++++ tests/tts_tests/test_tacotron2_train.py | 1 + tests/tts_tests/test_tacotron_layers.py | 1 + tests/tts_tests/test_tacotron_train.py | 2 ++ 9 files changed, 23 insertions(+), 2 deletions(-) diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index b197eaf6..2b67901c 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -46,6 +46,8 @@ class TacotronConfig(BaseTTSConfig): stopnet_pos_weight (float): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. + max_decoder_steps (int): + Max number of steps allowed for the decoder. Defaults to 10000. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -137,6 +139,7 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 + max_decoder_steps: int = 10000 # attention layers attention_type: str = "original" diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index 2f94db88..a6579171 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -267,6 +267,7 @@ class Decoder(nn.Module): attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 500. 
""" # Pylint gets confused by PyTorch conventions here @@ -289,12 +290,13 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.r_init = r self.r = r self.in_channels = in_channels - self.max_decoder_steps = 500 + self.max_decoder_steps = max_decoder_steps self.use_memory_queue = memory_size > 0 self.memory_size = memory_size if memory_size > 0 else r self.frame_channels = frame_channels diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index aeca8953..61fe9f4b 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -135,6 +135,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. """ # Pylint gets confused by PyTorch conventions here @@ -155,6 +156,7 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.frame_channels = frame_channels @@ -162,7 +164,7 @@ class Decoder(nn.Module): self.r = r self.encoder_embedding_dim = in_channels self.separate_stopnet = separate_stopnet - self.max_decoder_steps = 1000 + self.max_decoder_steps = max_decoder_steps self.stop_threshold = 0.5 # model dimensions diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 026f5c85..2a951267 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -30,6 +30,7 @@ def setup_model(num_chars, num_speakers, c, d_vector_dim=None): double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, d_vector_dim=d_vector_dim, + max_decoder_steps=c.max_decoder_steps, ) elif c.model.lower() == "tacotron2": model = MyModel( @@ -56,6 +57,7 @@ def setup_model(num_chars, num_speakers, c, d_vector_dim=None): double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, d_vector_dim=d_vector_dim, + max_decoder_steps=c.max_decoder_steps, ) elif c.model.lower() == "glow_tts": model = MyModel( diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 123b69a7..5eeeedaa 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -49,6 +49,7 @@ class Tacotron(TacotronAbstract): output frames to the prenet. gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. 
""" def __init__( @@ -80,6 +81,7 @@ class Tacotron(TacotronAbstract): gst=None, memory_size=5, gradual_training=None, + max_decoder_steps=500, ): super().__init__( num_chars, @@ -143,6 +145,7 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) self.postnet = PostCBHG(decoder_output_dim) self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) @@ -180,6 +183,7 @@ class Tacotron(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 4628c64e..b6da4e44 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -47,6 +47,7 @@ class Tacotron2(TacotronAbstract): gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. Defaults to `[]`. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. """ def __init__( @@ -77,6 +78,7 @@ class Tacotron2(TacotronAbstract): use_gst=False, gst=None, gradual_training=None, + max_decoder_steps=500, ): super().__init__( num_chars, @@ -138,6 +140,7 @@ class Tacotron2(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) self.postnet = Postnet(self.postnet_output_dim) @@ -174,6 +177,7 @@ class Tacotron2(TacotronAbstract): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ) @staticmethod diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 0d9a67a5..face77ae 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -24,6 +24,7 @@ config = Tacotron2Config( epochs=1, print_step=1, print_eval=True, + max_decoder_steps=50, ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 6c4b76b5..783be0db 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -61,6 +61,7 @@ class DecoderTests(unittest.TestCase): forward_attn_mask=True, location_attn=True, separate_stopnet=True, + max_decoder_steps=50, ) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 52560715..9443d73a 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -23,6 +23,8 @@ config = TacotronConfig( epochs=1, print_step=1, print_eval=True, + r=5, + max_decoder_steps=50, ) config.audio.do_trim_silence = True config.audio.trim_db = 60 From 419735f4401fd7a96b1b6fe2b87ddbc8b11cfe85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:39:52 +0200 Subject: [PATCH 182/258] refactor and fix multi-speaker training in Trainer and Tacotron models --- TTS/tts/datasets/TTSDataset.py | 4 +- TTS/tts/datasets/formatters.py | 15 + TTS/tts/models/tacotron.py | 7 +- TTS/tts/models/tacotron2.py | 10 +- TTS/tts/utils/speakers.py | 33 +- tests/data/ljspeech/speakers.json | 2612 +++++++++++++++++ tests/data_tests/__init__.py | 0 tests/inference_tests/__init__.py | 0 tests/test_speaker_manager.py | 6 +- tests/text_tests/__init__.py | 0 .../test_tacotron2_d-vectors_train.py | 57 + 
.../test_tacotron2_speaker_emb_train.py | 55 + 12 files changed, 2779 insertions(+), 20 deletions(-) create mode 100644 tests/data/ljspeech/speakers.json create mode 100644 tests/data_tests/__init__.py create mode 100644 tests/inference_tests/__init__.py create mode 100644 tests/text_tests/__init__.py create mode 100644 tests/tts_tests/test_tacotron2_d-vectors_train.py create mode 100644 tests/tts_tests/test_tacotron2_speaker_emb_train.py diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 2522b55a..d0fbb553 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -301,12 +301,12 @@ class TTSDataset(Dataset): # get pre-computed d-vectors if self.d_vector_mapping is not None: wav_files_names = [batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing] - d_vectors = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] + d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] else: d_vectors = None # get numerical speaker ids from speaker names if self.speaker_id_mapping: - speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in speaker_names] + speaker_ids = [self.speaker_id_mapping[sn] for sn in speaker_names] else: speaker_ids = None # compute features diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 815a1b1d..3cb37168 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -107,6 +107,21 @@ def ljspeech(root_path, meta_file): return items +def ljspeech_test(root_path, meta_file): + """Normalizes the LJSpeech meta data file for TTS testing + https://keithito.com/LJ-Speech-Dataset/""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "ljspeech" + with open(txt_file, "r", encoding="utf-8") as ttf: + for idx, line in enumerate(ttf): + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + text = cols[1] + items.append([text, wav_file, f"ljspeech-{idx}"]) + return items + + def sam_accenture(root_path, meta_file): """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 5eeeedaa..3ee70431 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -262,7 +262,12 @@ class Tacotron(TacotronAbstract): if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"]) + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] else: # B x 1 x speaker_embed_dim embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index b6da4e44..f6e59542 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -261,9 +261,13 @@ class Tacotron2(TacotronAbstract): # B x gst_dim encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: - x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] - x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) + if not self.use_d_vectors: + embedded_speakers = 
self.speaker_embedding(cond_input["speaker_ids"])[None] + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] else: embedded_speakers = cond_input["d_vectors"] diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 0f43bf97..01e26c6b 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -11,9 +11,16 @@ from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") +def _set_file_path(path): + """Find the speakers.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "speakers.json") + path_continue = os.path.join(path, "speakers.json") + if os.path.exists(path_restore): + return path_restore + if os.path.exists(path_continue): + return path_continue + raise FileNotFoundError(f" [!] `speakers.json` not found in {path}") def load_speaker_mapping(out_path): @@ -21,7 +28,7 @@ def load_speaker_mapping(out_path): if os.path.splitext(out_path)[1] == ".json": json_file = out_path else: - json_file = make_speakers_json_path(out_path) + json_file = _set_file_path(out_path) with open(json_file) as f: return json.load(f) @@ -29,7 +36,7 @@ def load_speaker_mapping(out_path): def save_speaker_mapping(out_path, speaker_mapping): """Saves speaker mapping if not yet present.""" if out_path is not None: - speakers_json_path = make_speakers_json_path(out_path) + speakers_json_path = _set_file_path(out_path) with open(speakers_json_path, "w") as f: json.dump(speaker_mapping, f, indent=4) @@ -40,10 +47,10 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): if c.use_speaker_embedding: speaker_manager.set_speaker_ids_from_data(meta_data_train) if restore_path: + speakers_file = _set_file_path(restore_path) # restoring speaker manager from a previous run. if c.use_external_speaker_embedding_file: # restore speaker manager with the embedding file - speakers_file = os.path.dirname(restore_path) if not os.path.exists(speakers_file): print( "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" @@ -55,7 +62,6 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. 
- speakers_file = os.path.dirname(restore_path) speaker_ids_from_data = speaker_manager.speaker_ids speaker_manager.set_speaker_ids_from_file(speakers_file) assert all( @@ -75,8 +81,8 @@ def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): ) # save file if path is defined if out_path: - out_file_path = os.path.join(out_path, "speaker.json") - print(" > Saving `speaker.json` to {out_file_path}.") + out_file_path = os.path.join(out_path, "speakers.json") + print(f" > Saving `speakers.json` to {out_file_path}.") if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: speaker_manager.save_d_vectors_to_file(out_file_path) else: @@ -138,7 +144,7 @@ class SpeakerManager: self.speaker_encoder_ap = None if data_items: - self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) + self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items) if d_vectors_file_path: self.set_d_vectors_from_file(d_vectors_file_path) @@ -163,6 +169,10 @@ class SpeakerManager: def num_speakers(self): return len(self.speaker_ids) + @property + def speaker_names(self): + return list(self.speaker_ids.keys()) + @property def d_vector_dim(self): """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" @@ -224,7 +234,8 @@ class SpeakerManager: file_path (str): Path to the target json file. """ self.d_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values()))) + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) def get_d_vector_by_clip(self, clip_idx: str) -> List: diff --git a/tests/data/ljspeech/speakers.json b/tests/data/ljspeech/speakers.json new file mode 100644 index 00000000..915cff73 --- /dev/null +++ b/tests/data/ljspeech/speakers.json @@ -0,0 +1,2612 @@ +{ + "LJ001-0001.wav": { + "name": "ljspeech-0", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 
0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + 
-0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0002.wav": { + "name": "ljspeech-1", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + 
-0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + 
-0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0003.wav": { + "name": "ljspeech-2", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + 
-0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0004.wav": { + "name": "ljspeech-3", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 
0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 
0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0005.wav": { + "name": "ljspeech-4", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 
0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 
0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0006.wav": { + "name": "ljspeech-5", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 
0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0007.wav": { + "name": 
"ljspeech-6", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + 
-0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0008.wav": { + "name": "ljspeech-7", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + 
-0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, 
+ 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0009.wav": { + "name": "ljspeech-8", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + 
-0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 
0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0010.wav": { + "name": "ljspeech-9", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + 
-0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + } +} diff --git a/tests/data_tests/__init__.py b/tests/data_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/inference_tests/__init__.py 
b/tests/inference_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index a695fe61..baa50749 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -66,10 +66,10 @@ class SpeakerManagerTest(unittest.TestCase): print(manager.clip_ids) d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) assert len(d_vector) == 256 - d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0]) + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) assert len(d_vectors[0]) == 256 - d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) assert len(d_vector1) == 256 - d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) assert len(d_vector2) == 256 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/text_tests/__init__.py b/tests/text_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py new file mode 100644 index 00000000..7fda7e09 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -0,0 +1,57 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_val_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + use_external_speaker_embedding_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + external_speaker_embedding_file="tests/data/ljspeech/speakers.json", + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py new file mode 100644 index 00000000..a242c724 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ 
-0,0 +1,55 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_val_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 82582993ccbd15668f8dfe81cf8c532619132b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:41:45 +0200 Subject: [PATCH 183/258] use one testing sentence in tts tests --- tests/tts_tests/test_align_tts_train.py | 3 +++ tests/tts_tests/test_glow_tts_train.py | 3 +++ tests/tts_tests/test_speedy_speech_train.py | 3 +++ tests/tts_tests/test_tacotron2_train.py | 3 +++ tests/tts_tests/test_tacotron_train.py | 3 +++ 5 files changed, 15 insertions(+) diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 3d802d5f..61d67c5c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -23,6 +23,9 @@ config = AlignTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 7e6c069c..30aaefc4 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -24,6 +24,9 @@ config = GlowTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 6be3da97..d677f46f 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -23,6 +23,9 @@ config = SpeedySpeechConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 diff --git a/tests/tts_tests/test_tacotron2_train.py 
b/tests/tts_tests/test_tacotron2_train.py index face77ae..70975490 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -23,6 +23,9 @@ config = Tacotron2Config( test_delay_epochs=-1, epochs=1, print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], print_eval=True, max_decoder_steps=50, ) diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 9443d73a..010154e2 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -22,6 +22,9 @@ config = TacotronConfig( test_delay_epochs=-1, epochs=1, print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], print_eval=True, r=5, max_decoder_steps=50, From 25238e0658697bbcb355bcd99c2a52eaf1910680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:42:35 +0200 Subject: [PATCH 184/258] fix glow-tts `inference()` --- TTS/tts/models/glow_tts.py | 5 +++- TTS/tts/utils/synthesis.py | 31 +++++++++++++++++++++-- tests/inference_tests/test_synthesizer.py | 3 ++- tests/tts_tests/test_tacotron2_model.py | 4 +-- tests/tts_tests/test_tacotron2_train.py | 1 - 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 9c928a67..3b3207f0 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -290,7 +290,10 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value + def inference( + self, x, cond_input={"x_lengths": None, "d_vectors": None} + ): # pylint: disable=dangerous-default-value + x_lengths = cond_input["x_lengths"] g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: if self.d_vector_dim: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 04fef715..72eff2e5 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,8 +1,10 @@ import os +from typing import Dict import numpy as np import pkg_resources import torch +from torch import nn from .text import phoneme_to_sequence, text_to_sequence @@ -65,9 +67,34 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None): +def run_model_torch( + model: nn.Module, + inputs: torch.Tensor, + speaker_id: int = None, + style_mel: torch.Tensor = None, + d_vector: torch.Tensor = None, +) -> Dict: + """Run a torch model for inference. It does not support batch inference. + + Args: + model (nn.Module): The model to run inference. + inputs (torch.Tensor): Input tensor with character ids. + speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None. + style_mel (torch.Tensor, optional): Spectrograms used for voice styling . Defaults to None. + d_vector (torch.Tensor, optional): d-vector for multi-speaker models . Defaults to None. + + Returns: + Dict: model outputs. 
+ """ + input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) outputs = model.inference( - inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel} + inputs, + cond_input={ + "x_lengths": input_lengths, + "speaker_ids": speaker_id, + "d_vectors": d_vector, + "style_mel": style_mel, + }, ) return outputs diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index b0fa22d3..4379c8ca 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,13 +1,14 @@ import os import unittest -from tests import get_tests_output_path from TTS.config import load_config from TTS.tts.models import setup_model from TTS.tts.utils.io import save_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.synthesizer import Synthesizer +from .. import get_tests_output_path + class SynthesizerTest(unittest.TestCase): # pylint: disable=R0201 diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index b77f7cc5..66372470 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -259,9 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to( - device - ) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 70975490..577de014 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -8,7 +8,6 @@ from TTS.tts.configs import Tacotron2Config config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") - config = Tacotron2Config( r=5, batch_size=8, From fdfb18d23056f7683e077a9f972befe74d7e5530 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 13:42:52 +0200 Subject: [PATCH 185/258] downsize melgan test model size --- TTS/tts/datasets/formatters.py | 1 - tests/vocoder_tests/test_melgan_train.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 3cb37168..db7841f4 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -112,7 +112,6 @@ def ljspeech_test(root_path, meta_file): https://keithito.com/LJ-Speech-Dataset/""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "ljspeech" with open(txt_file, "r", encoding="utf-8") as ttf: for idx, line in enumerate(ttf): cols = line.split("|") diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 3ff65b5a..551b786a 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -19,7 +19,7 @@ config = MelganConfig( seq_len=2048, eval_split_size=1, print_step=1, - discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]}, + discriminator_model_params={"base_channels": 16, "max_channels": 64, 
"downsample_factors": [4, 4, 4]}, print_eval=True, data_path="tests/data/ljspeech", output_path=output_path, From 03494ad6428868ece287ca922715add4f3bd0fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 6 Jun 2021 15:20:17 +0200 Subject: [PATCH 186/258] adjust `distribute.py` for the `train_tts.py` --- TTS/bin/distribute.py | 3 +- TTS/trainer.py | 52 +++++++++++++------ TTS/tts/utils/synthesis.py | 6 ++- .../ljspeech/tacotron2-DDC/tacotron2-DDC.json | 7 ++- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index ea43f88b..20d4bb20 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -30,7 +30,7 @@ def main(): parser.add_argument( "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in sys.argv ) - args = parser.parse_args() + args, unargs = parser.parse_known_args() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") @@ -42,6 +42,7 @@ def main(): command.append("--restore_path={}".format(args.restore_path)) command.append("--config_path={}".format(args.config_path)) command.append("--group_id=group_{}".format(group_id)) + command += unargs command.append("") # run processes diff --git a/TTS/trainer.py b/TTS/trainer.py index 9fe2f108..76c741b1 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -65,15 +65,19 @@ class TrainerTTS: self, args: Union[Coqpit, Namespace], config: Coqpit, - c_logger: ConsoleLogger, - tb_logger: TensorboardLogger, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, model: nn.Module = None, output_path: str = None, ) -> None: self.args = args self.config = config - self.c_logger = c_logger - self.tb_logger = tb_logger + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) + else: + self.tb_logger = tb_logger self.output_path = output_path self.total_steps_done = 0 @@ -117,20 +121,20 @@ class TrainerTTS: # setup criterion self.criterion = self.get_criterion(self.config) - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - # DISTRUBUTED if self.num_gpus > 1: init_distributed( args.rank, self.num_gpus, args.group_id, - self.config.distributed["backend"], - self.config.distributed["url"], + self.config.distributed_backend, + self.config.distributed_url, ) + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + # scalers for mixed precision training self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None @@ -147,7 +151,7 @@ class TrainerTTS: # DISTRUBUTED if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank]) + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) # count model size num_params = count_parameters(self.model) @@ -377,6 +381,11 @@ class TrainerTTS: "item_idx": item_idx, } + def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.train_step(batch, criterion) + return self.model.train_step(batch, criterion) + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: self.on_train_step_start() step_start_time = time.time() @@ -389,7 +398,7 @@ class TrainerTTS: self.optimizer.zero_grad() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.train_step(batch, self.criterion) + outputs, loss_dict = self._train_step(batch, self.criterion) # check nan loss if torch.isnan(loss_dict["loss"]).any(): @@ -473,7 +482,10 @@ class TrainerTTS: scaler=self.scaler.state_dict() if self.config.mixed_precision else None, ) # training visualizations - figures, audios = self.model.train_log(self.ap, batch, outputs) + if hasattr(self.model, "module"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + else: + figures, audios = self.model.train_log(self.ap, batch, outputs) self.tb_logger.tb_train_figures(self.total_steps_done, figures) self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) self.total_steps_done += 1 @@ -500,12 +512,17 @@ class TrainerTTS: if self.config.tb_model_param_stats: self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.eval_step(batch, self.criterion) + return self.model.eval_step(batch, self.criterion) + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: with torch.no_grad(): step_start_time = time.time() with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self.model.eval_step(batch, self.criterion) + outputs, loss_dict = self._eval_step(batch) step_time = time.time() - step_start_time @@ -542,7 +559,10 @@ class TrainerTTS: outputs, _ = self.eval_step(batch, cur_step) # Plot epoch stats and samples from the last batch. 
if self.args.rank == 0: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + if hasattr(self.model, "module"): + figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) + else: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) self.tb_logger.tb_eval_figures(self.total_steps_done, figures) self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) @@ -642,7 +662,7 @@ class TrainerTTS: self.train_epoch() if self.config.run_eval: self.eval_epoch() - if epoch >= self.config.test_delay_epochs: + if epoch >= self.config.test_delay_epochs and self.args.rank < 0: self.test_run() self.c_logger.print_epoch_end( epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 72eff2e5..46f919dc 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -87,7 +87,11 @@ def run_model_torch( Dict: model outputs. """ input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) - outputs = model.inference( + if hasattr(model, "module"): + _func = model.module.inference + else: + _func = model.inference + outputs = _func( inputs, cond_input={ "x_lengths": input_lengths, diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..e3531851 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -36,12 +36,14 @@ "gst_num_heads": 4, "gst_num_style_tokens": 10 }, + "distributed_backend": "gloo", + "distributed_url": "tcp:\/\/localhost:54321", "model": "Tacotron2", "run_name": "ljspeech-ddc", "run_description": "tacotron2 with double decoder consistency.", "batch_size": 64, "eval_batch_size": 16, - "mixed_precision": true, + "mixed_precision": false, "loss_masking": true, "decoder_loss_alpha": 0.25, "postnet_loss_alpha": 0.25, @@ -54,6 +56,7 @@ "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, + "max_decoder_steps": 50, "noam_schedule": true, "grad_clip": 0.05, "epochs": 1000, @@ -88,4 +91,4 @@ "phoneme_cache_path": "DEFINE THIS", "use_phonemes": false, "phoneme_language": "en-us" -} \ No newline at end of file +} From 166f0aeb9a5e0fc16231f67f468283d6b1dffa08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 7 Jun 2021 15:11:33 +0200 Subject: [PATCH 187/258] merge if branches with the same implementation --- TTS/tts/utils/synthesis.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 46f919dc..9064811a 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -248,15 +248,11 @@ def synthesis( style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = text_inputs.unsqueeze(0) - elif backend == "tf": + elif backend in ["tf", "tflite"]: # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) text_inputs = numpy_to_tf(text_inputs, tf.int32) text_inputs = tf.expand_dims(text_inputs, 0) - elif backend == "tflite": - style_mel = numpy_to_tf(style_mel, tf.float32) - text_inputs = numpy_to_tf(text_inputs, tf.int32) - text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) From 
00c82c516d5074315412a0c57fa0f0f6247f318b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 7 Jun 2021 16:08:56 +0200 Subject: [PATCH 188/258] rename `cond_input` to `aux_input` --- TTS/bin/extract_tts_spectrograms.py | 6 ++--- TTS/trainer.py | 14 +++++----- TTS/tts/models/align_tts.py | 12 ++++----- TTS/tts/models/glow_tts.py | 22 +++++++-------- TTS/tts/models/speedy_speech.py | 12 ++++----- TTS/tts/models/tacotron.py | 26 +++++++++--------- TTS/tts/models/tacotron2.py | 28 ++++++++++---------- TTS/tts/models/tacotron_abstract.py | 6 ++--- TTS/tts/utils/synthesis.py | 2 +- TTS/utils/generic_utils.py | 2 +- recipes/kokoro/tacotron2-DDC/run.sh | 10 +++---- recipes/ljspeech/tacotron2-DDC/run.sh | 8 +++--- tests/tts_tests/test_speedy_speech_layers.py | 4 +-- tests/tts_tests/test_tacotron2_model.py | 10 +++---- tests/tts_tests/test_tacotron_model.py | 10 +++---- 15 files changed, 85 insertions(+), 87 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 24665871..975f29d9 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -136,14 +136,14 @@ def inference( speaker_c = d_vectors outputs = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c} + text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c} ) model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) postnet_outputs = outputs["model_outputs"] # normalize tacotron output if model_name == "tacotron": diff --git a/TTS/trainer.py b/TTS/trainer.py index 76c741b1..c1d1c340 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -573,7 +573,7 @@ class TrainerTTS: test_audios = {} test_figures = {} test_sentences = self.config.test_sentences - cond_inputs = self._get_cond_inputs() + aux_inputs = self._get_aux_inputs() for idx, sen in enumerate(test_sentences): wav, alignment, model_outputs, _ = synthesis( self.model, @@ -581,9 +581,9 @@ class TrainerTTS: self.config, self.use_cuda, self.ap, - speaker_id=cond_inputs["speaker_id"], - d_vector=cond_inputs["d_vector"], - style_wav=cond_inputs["style_wav"], + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, do_trim_silence=False, @@ -600,7 +600,7 @@ class TrainerTTS: self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - def _get_cond_inputs(self) -> Dict: + def _get_aux_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None # setup d_vector @@ -620,8 +620,8 @@ class TrainerTTS: print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return cond_inputs + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs def 
fit(self) -> None: if self.restore_step != 0 or self.args.best_path: diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 20b0cdf7..6c268a43 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -212,7 +212,7 @@ class AlignTTS(nn.Module): return dr_mas, mu, log_sigma, logp def forward( - self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None + self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None ): # pylint: disable=unused-argument """ Shapes: @@ -223,7 +223,7 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -267,14 +267,14 @@ class AlignTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument + def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -296,8 +296,8 @@ class AlignTTS(nn.Module): d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] - cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input, self.phase) loss_dict = criterion( outputs["logp"], outputs["model_outputs"], diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 3b3207f0..e61b80c2 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -144,7 +144,7 @@ class GlowTTS(nn.Module): return y_mean, y_log_scale, o_attn_dur def forward( - self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None} + self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -157,7 +157,7 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -197,7 +197,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def inference_with_MAS( - self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None} + self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
@@ -212,7 +212,7 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -258,7 +258,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def decoder_inference( - self, y, y_lengths=None, cond_input={"d_vectors": None} + self, y, y_lengths=None, aux_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -268,7 +268,7 @@ class GlowTTS(nn.Module): """ y = y.transpose(1, 2) y_max_length = y.size(2) - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None # norm speaker embeddings if g is not None: if self.external_d_vector_dim: @@ -290,11 +290,9 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference( - self, x, cond_input={"x_lengths": None, "d_vectors": None} - ): # pylint: disable=dangerous-default-value - x_lengths = cond_input["x_lengths"] - g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None}): # pylint: disable=dangerous-default-value + x_lengths = aux_input["x_lengths"] + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) @@ -343,7 +341,7 @@ class GlowTTS(nn.Module): mel_lengths = batch["mel_lengths"] d_vectors = batch["d_vectors"] - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": d_vectors}) loss_dict = criterion( outputs["model_outputs"], diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 53f7bbaa..d4a90a2e 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward( - self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None} + self, x, x_lengths, y_lengths, dr, aux_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids @@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument + def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["d_vectors"] if "d_vectors" in cond_input else None + g = aux_input["d_vectors"] if "d_vectors" in aux_input else 
None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 @@ -208,8 +208,8 @@ class SpeedySpeech(nn.Module): speaker_ids = batch["speaker_ids"] durations = batch["durations"] - cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} - outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, durations, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 3ee70431..317d1905 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -186,14 +186,14 @@ class Tacotron(TacotronAbstract): max_decoder_steps, ) - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ outputs = {"alignments_backward": None, "decoder_outputs_backward": None} inputs = self.embedding(text) @@ -205,15 +205,15 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) # speaker embedding if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features @@ -252,17 +252,17 @@ class Tacotron(TacotronAbstract): return outputs @torch.no_grad() - def inference(self, text_input, cond_input=None): - cond_input = self._format_cond_input(cond_input) + def inference(self, text_input, aux_input=None): + aux_input = self._format_aux_input(aux_input) inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"]) + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"]) # reshape embedded_speakers if embedded_speakers.ndim == 1: embedded_speakers = embedded_speakers[None, None, :] @@ -270,7 +270,7 @@ class Tacotron(TacotronAbstract): embedded_speakers = embedded_speakers[None, :] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, 
embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -306,7 +306,7 @@ class Tacotron(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -317,8 +317,8 @@ class Tacotron(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index f6e59542..d56bd988 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -186,16 +186,16 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, cond_input=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ - cond_input = self._format_cond_input(cond_input) + aux_input = self._format_aux_input(aux_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) @@ -206,14 +206,14 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -252,24 +252,24 @@ class Tacotron2(TacotronAbstract): return outputs @torch.no_grad() - def inference(self, text, cond_input=None): - cond_input = self._format_cond_input(cond_input) + def inference(self, text, aux_input=None): + aux_input = self._format_aux_input(aux_input) embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: if not self.use_d_vectors: - embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[None] 
+ embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] # reshape embedded_speakers if embedded_speakers.ndim == 1: embedded_speakers = embedded_speakers[None, None, :] elif embedded_speakers.ndim == 2: embedded_speakers = embedded_speakers[None, :] else: - embedded_speakers = cond_input["d_vectors"] + embedded_speakers = aux_input["d_vectors"] encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) @@ -307,7 +307,7 @@ class Tacotron2(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -318,8 +318,8 @@ class Tacotron2(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) # compute loss loss_dict = criterion( diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index e480e2e0..705ea5bc 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -6,7 +6,7 @@ import torch from torch import nn from TTS.tts.utils.data import sequence_mask -from TTS.utils.generic_utils import format_cond_input +from TTS.utils.generic_utils import format_aux_input from TTS.utils.training import gradual_training_scheduler @@ -97,8 +97,8 @@ class TacotronAbstract(ABC, nn.Module): self.coarse_decoder = None @staticmethod - def _format_cond_input(cond_input: Dict) -> Dict: - return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input) + def _format_aux_input(aux_input: Dict) -> Dict: + return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) ############################# # INIT FUNCTIONS diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9064811a..39474cab 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -93,7 +93,7 @@ def run_model_torch( _func = model.inference outputs = _func( inputs, - cond_input={ + aux_input={ "x_lengths": input_lengths, "speaker_ids": speaker_id, "d_vectors": d_vector, diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index a1abf5fe..67cd0bf5 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -136,7 +136,7 @@ def set_init_dict(model_dict, checkpoint_state, c): return model_dict -def format_cond_input(def_args: Dict, kwargs: Dict) -> Dict: +def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: """Format kwargs to hande auxilary inputs to models. Args: diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index 86fda642..69800cf7 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -16,8 +16,8 @@ tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.c python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... 
# change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ - --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DDC/run.sh b/recipes/ljspeech/tacotron2-DDC/run.sh index eaa05b60..dd36454f 100644 --- a/recipes/ljspeech/tacotron2-DDC/run.sh +++ b/recipes/ljspeech/tacotron2-DDC/run.sh @@ -16,7 +16,7 @@ rm LJSpeech-1.1.tar.bz2 python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ # training .... # change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 7c4f0adf..d2f62d49 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -57,7 +57,7 @@ def test_speedy_speech(): # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model.forward( - x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} + x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) o_de = outputs["model_outputs"] attn = outputs["alignments"] @@ -71,7 +71,7 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)}) + model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] o_dr = outputs["durations_log"] diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 66372470..fc3d9799 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -53,7 +53,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ 
-105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -158,7 +158,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -214,7 +214,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -269,7 +269,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 31682d7a..2abd968d 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -69,7 +69,7 @@ class TacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -194,7 +194,7 @@ class TacotronGSTTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -257,7 +257,7 @@ class TacotronGSTTrainTest(unittest.TestCase): 
optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"speaker_ids": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) From c754a0e17daf7e8f334f02a5581f72c25803ed97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 14:18:18 +0200 Subject: [PATCH 189/258] `TrainerAbstract` and related updates for `TrainerTTS` --- TTS/bin/train_tts.py | 2 +- TTS/trainer.py | 714 ++++------------------------------------- TTS/tts/trainer_tts.py | 709 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 771 insertions(+), 654 deletions(-) create mode 100644 TTS/tts/trainer_tts.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 3270d0e0..06765906 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -2,7 +2,7 @@ import os import sys import traceback -from TTS.trainer import TrainerTTS +from TTS.tts.trainer_tts import TrainerTTS from TTS.utils.arguments import init_training from TTS.utils.generic_utils import remove_experiment_folder diff --git a/TTS/trainer.py b/TTS/trainer.py index c1d1c340..5c02fdfb 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,39 +1,23 @@ # -*- coding: utf-8 -*- import importlib -import logging -import os -import time -from argparse import Namespace +from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, TypeVar import torch from coqpit import Coqpit # DISTRIBUTED from torch import nn -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler -from TTS.tts.datasets import TTSDataset, load_meta_data -from TTS.tts.layers import setup_loss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda -from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.training import check_update, setup_torch_training_env +_DataLoader = TypeVar("_DataLoader") @dataclass class TrainingArgs(Coqpit): + """Trainer arguments that are parsed externally (e.g. 
CLI)""" + continue_path: str = field( default="", metadata={ @@ -58,676 +42,100 @@ class TrainingArgs(Coqpit): # pylint: disable=import-outside-toplevel, too-many-public-methods -class TrainerTTS: - use_cuda, num_gpus = setup_torch_training_env(True, False) - def __init__( - self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger = None, - tb_logger: TensorboardLogger = None, - model: nn.Module = None, - output_path: str = None, - ) -> None: - self.args = args - self.config = config - self.c_logger = ConsoleLogger() if c_logger is None else c_logger - if tb_logger is None: - self.tb_logger = TensorboardLogger(output_path, model_name=config.model) - self.tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) - else: - self.tb_logger = tb_logger - self.output_path = output_path - self.total_steps_done = 0 - self.epochs_done = 0 - self.restore_step = 0 - self.best_loss = float("inf") - self.train_loader = None - self.eval_loader = None - self.output_audio_path = os.path.join(output_path, "test_audios") - - self.keep_avg_train = None - self.keep_avg_eval = None - - log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") - self._setup_logger_config(log_file) - - # model, audio processor, datasets, loss - # init audio processor - self.ap = AudioProcessor(**self.config.audio.to_dict()) - - # init character processor - self.model_characters = self.get_character_processor(self.config) - - # load dataset samples - self.data_train, self.data_eval = load_meta_data(self.config.datasets) - - # default speaker manager - self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) - - # init TTS model - if model is not None: - self.model = model - else: - self.model = self.get_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, - ) - - # setup criterion - self.criterion = self.get_criterion(self.config) - - # DISTRUBUTED - if self.num_gpus > 1: - init_distributed( - args.rank, - self.num_gpus, - args.group_id, - self.config.distributed_backend, - self.config.distributed_url, - ) - - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - - # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None - - # setup optimizer - self.optimizer = self.get_optimizer(self.model, self.config) - - if self.args.restore_path: - self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, self.scaler - ) - - # setup scheduler - self.scheduler = self.get_scheduler(self.config, self.optimizer) - - # DISTRUBUTED - if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) - - # count model size - num_params = count_parameters(self.model) - print("\n > Model has {} parameters".format(num_params)) +class TrainerAbstract(ABC): @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, d_vector_dim) - return model + def _is_apex_available(): + return importlib.util.find_spec("apex") is not None @staticmethod + @abstractmethod + def get_model(*args, **kwargs) -> nn.Module: + pass + + @staticmethod + @abstractmethod def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - optimizer_name = config.optimizer - optimizer_params = config.optimizer_params - if optimizer_name.lower() == "radam": - module = importlib.import_module("TTS.utils.radam") - optimizer = getattr(module, "RAdam") - else: - optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - - @staticmethod - def get_character_processor(config: Coqpit) -> str: - # setup custom characters if set in config file. 
- # TODO: implement CharacterProcessor - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - else: - from TTS.tts.utils.text.symbols import phonemes, symbols - model_characters = phonemes if config.use_phonemes else symbols - return model_characters - - @staticmethod - def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None - ) -> SpeakerManager: - speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) - return speaker_manager + pass @staticmethod + @abstractmethod def get_scheduler( config: Coqpit, optimizer: torch.optim.Optimizer ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - lr_scheduler = config.lr_scheduler - lr_scheduler_params = config.lr_scheduler_params - if lr_scheduler is None: - return None - if lr_scheduler.lower() == "noamlr": - from TTS.utils.training import NoamLR - - scheduler = NoamLR - else: - scheduler = getattr(torch.optim, lr_scheduler) - return scheduler(optimizer, **lr_scheduler_params) + pass @staticmethod + @abstractmethod def get_criterion(config: Coqpit) -> nn.Module: - return setup_loss(config) + pass - def restore_model( - self, - config: Coqpit, - restore_path: str, - model: nn.Module, - optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None, - ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: - print(" > Restoring from %s ..." % os.path.basename(restore_path)) - checkpoint = torch.load(restore_path) - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict + @abstractmethod + def restore_model(self, *args, **kwargs) -> Tuple: + pass - for group in optimizer.param_groups: - group["lr"] = self.config.lr - print( - " > Model restored from step %d" % checkpoint["step"], - ) - restore_step = checkpoint["step"] - return model, optimizer, scaler, restore_step + @abstractmethod + def get_train_dataloader(self, *args, **kwargs) -> _DataLoader: + pass - def _get_loader( - self, - r: int, - ap: AudioProcessor, - is_eval: bool, - data_items: List, - verbose: bool, - speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List], - ) -> DataLoader: - if is_eval and not self.config.run_eval: - loader = None - else: - dataset = TTSDataset( - outputs_per_step=r, - text_cleaner=self.config.text_cleaner, - compute_linear_spec=self.config.model.lower() == "tacotron", - meta_data=data_items, - ap=ap, - tp=self.config.characters, - add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, - min_seq_len=self.config.min_seq_len, - max_seq_len=self.config.max_seq_len, - phoneme_cache_path=self.config.phoneme_cache_path, - use_phonemes=self.config.use_phonemes, - phoneme_language=self.config.phoneme_language, - enable_eos_bos=self.config.enable_eos_bos_chars, - use_noise_augment=not is_eval, - verbose=verbose, - speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, - 
d_vector_mapping=d_vectors - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, - ) - - if self.config.use_phonemes and self.config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(self.config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, - pin_memory=False, - ) - return loader - - def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) - - def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) + @abstractmethod + def get_eval_dataloder(self, *args, **kwargs) -> _DataLoader: + pass + @abstractmethod def format_batch(self, batch: List) -> Dict: - # setup input batch - text_input = batch[0] - text_lengths = batch[1] - speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None - mel_input = batch[4] - mel_lengths = batch[5] - stop_targets = batch[6] - item_idx = batch[7] - d_vectors = batch[8] - speaker_ids = batch[9] - attn_mask = batch[10] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - # compute durations from attention masks - durations = None - if attn_mask is not None: - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - - # set stop targets view, we predict a single stop token per iteration. 
- stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch batch to GPU - if self.use_cuda: - text_input = to_cuda(text_input) - text_lengths = to_cuda(text_lengths) - mel_input = to_cuda(mel_input) - mel_lengths = to_cuda(mel_lengths) - linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None - stop_targets = to_cuda(stop_targets) - attn_mask = to_cuda(attn_mask) if attn_mask is not None else None - durations = to_cuda(durations) if attn_mask is not None else None - if speaker_ids is not None: - speaker_ids = to_cuda(speaker_ids) - if d_vectors is not None: - d_vectors = to_cuda(d_vectors) - - return { - "text_input": text_input, - "text_lengths": text_lengths, - "speaker_names": speaker_names, - "mel_input": mel_input, - "mel_lengths": mel_lengths, - "linear_input": linear_input, - "stop_targets": stop_targets, - "attn_mask": attn_mask, - "durations": durations, - "speaker_ids": speaker_ids, - "d_vectors": d_vectors, - "max_text_length": max_text_length, - "max_spec_length": max_spec_length, - "item_idx": item_idx, - } + pass + @abstractmethod def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.train_step(batch, criterion) - return self.model.train_step(batch, criterion) + pass + @abstractmethod def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - self.on_train_step_start() - step_start_time = time.time() - - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - - # zero-out optimizer - self.optimizer.zero_grad() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._train_step(batch, self.criterion) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") - - # optimizer step - if self.config.mixed_precision: - # model optimizer step in mixed precision mode - self.scaler.scale(loss_dict["loss"]).backward() - self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.scaler.step(self.optimizer) - self.scaler.update() - else: - # main model optimizer step - loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.optimizer.step() - - step_time = time.time() - step_start_time - - # setup lr - if self.config.lr_scheduler: - self.scheduler.step() - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - self.keep_avg_train.update_values(update_train_values) - - # print training progress - current_lr = self.optimizer.param_groups[0]["lr"] - if self.total_steps_done % self.config.print_step == 0: - log_dict = { - "max_spec_length": [batch["max_spec_length"], 1], # value, precision - "max_text_length": [batch["max_text_length"], 1], - "step_time": [step_time, 4], - 
"loader_time": [loader_time, 2], - "current_lr": current_lr, - } - self.c_logger.print_train_step( - batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values - ) - - if self.args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if self.total_steps_done % self.config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - } - iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) - - if self.total_steps_done % self.config.save_step == 0: - if self.config.checkpoint: - # save model - save_checkpoint( - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - model_loss=loss_dict["loss"], - characters=self.model_characters, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - # training visualizations - if hasattr(self.model, "module"): - figures, audios = self.model.module.train_log(self.ap, batch, outputs) - else: - figures, audios = self.model.train_log(self.ap, batch, outputs) - self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) - self.total_steps_done += 1 - self.on_train_step_end() - return outputs, loss_dict + pass + @abstractmethod def train_epoch(self) -> None: - self.model.train() - epoch_start_time = time.time() - if self.use_cuda: - batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) - self.c_logger.print_train_start() - loader_start_time = time.time() - for cur_step, batch in enumerate(self.train_loader): - _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) - epoch_time = time.time() - epoch_start_time - # Plot self.epochs_done Stats - if self.args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) - if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + pass + @abstractmethod def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.eval_step(batch, self.criterion) - return self.model.eval_step(batch, self.criterion) + pass + @abstractmethod def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - with torch.no_grad(): - step_start_time = time.time() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._eval_step(batch) - - step_time = time.time() - step_start_time - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_step_time"] = step_time - self.keep_avg_eval.update_values(update_eval_values) - - if self.config.print_eval: - self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) - return outputs, loss_dict + pass + @abstractmethod def eval_epoch(self) -> None: - self.model.eval() - self.c_logger.print_eval_start() - 
loader_start_time = time.time() - batch = None - for cur_step, batch in enumerate(self.eval_loader): - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) - outputs, _ = self.eval_step(batch, cur_step) - # Plot epoch stats and samples from the last batch. - if self.args.rank == 0: - if hasattr(self.model, "module"): - figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) - else: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) - self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) + pass - def test_run( - self, - ) -> None: - print(" | > Synthesizing test sentences.") - test_audios = {} - test_figures = {} - test_sentences = self.config.test_sentences - aux_inputs = self._get_aux_inputs() - for idx, sen in enumerate(test_sentences): - wav, alignment, model_outputs, _ = synthesis( - self.model, - sen, - self.config, - self.use_cuda, - self.ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - self.ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) - self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - - def _get_aux_inputs(self) -> Dict: - # setup speaker_id - speaker_id = 0 if self.config.use_speaker_embedding else None - # setup d_vector - d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) - if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding - else None - ) - # setup style_mel - if self.config.has("gst_style_input"): - style_wav = self.config.gst_style_input - else: - style_wav = None - if style_wav is None and "use_gst" in self.config and self.config.use_gst: - # inicialize GST with zero dict. 
- style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(self.config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return aux_inputs + @abstractmethod + def test_run(self) -> None: + pass + @abstractmethod def fit(self) -> None: - if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {self.best_loss}.") - - # define data loaders - self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - self.eval_loader = ( - self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - if self.config.run_eval - else None - ) - - self.total_steps_done = self.restore_step - - for epoch in range(0, self.config.epochs): - self.on_epoch_start() - self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage() if self.config.run_eval else None - self.epochs_done = epoch - self.c_logger.print_epoch_start(epoch, self.config.epochs) - self.train_epoch() - if self.config.run_eval: - self.eval_epoch() - if epoch >= self.config.test_delay_epochs and self.args.rank < 0: - self.test_run() - self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values - ) - self.save_best_model() - self.on_epoch_end() + pass + @abstractmethod def save_best_model(self) -> None: - self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], - self.best_loss, - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - self.model_characters, - keep_all_best=self.config.keep_all_best, - keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) + pass - @staticmethod - def _setup_logger_config(log_file: str) -> None: - logging.basicConfig( - level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] - ) + @abstractmethod + def on_epoch_start(self) -> None: + pass - def on_epoch_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_start"): - self.model.on_epoch_start(self) + @abstractmethod + def on_epoch_end(self) -> None: + pass - if hasattr(self.criterion, "on_epoch_start"): - self.criterion.on_epoch_start(self) + @abstractmethod + def on_train_step_start(self) -> None: + pass - if hasattr(self.optimizer, "on_epoch_start"): - self.optimizer.on_epoch_start(self) - - def on_epoch_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_end"): - self.model.on_epoch_end(self) - - if hasattr(self.criterion, "on_epoch_end"): - self.criterion.on_epoch_end(self) - - if hasattr(self.optimizer, "on_epoch_end"): - self.optimizer.on_epoch_end(self) - - def on_train_step_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_start"): - self.model.on_train_step_start(self) - - if hasattr(self.criterion, 
"on_train_step_start"): - self.criterion.on_train_step_start(self) - - if hasattr(self.optimizer, "on_train_step_start"): - self.optimizer.on_train_step_start(self) - - def on_train_step_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_end"): - self.model.on_train_step_end(self) - - if hasattr(self.criterion, "on_train_step_end"): - self.criterion.on_train_step_end(self) - - if hasattr(self.optimizer, "on_train_step_end"): - self.optimizer.on_train_step_end(self) + @abstractmethod + def on_train_step_end(self) -> None: + pass diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py new file mode 100644 index 00000000..9d060498 --- /dev/null +++ b/TTS/tts/trainer_tts.py @@ -0,0 +1,709 @@ +# -*- coding: utf-8 -*- + +import importlib +import logging +import os +import time +from argparse import Namespace +from typing import Dict, List, Tuple, Union + +import torch +from coqpit import Coqpit + +# DISTRIBUTED +from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.trainer import TrainerAbstract +from TTS.tts.datasets import TTSDataset, load_meta_data +from TTS.tts.layers import setup_loss +from TTS.tts.models import setup_model +from TTS.tts.utils.io import save_best_model, save_checkpoint +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text.symbols import make_symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda +from TTS.utils.logging import ConsoleLogger, TensorboardLogger +from TTS.utils.training import check_update, setup_torch_training_env + + +# pylint: disable=import-outside-toplevel, too-many-public-methods + +class TrainerTTS(TrainerAbstract): + use_cuda, num_gpus = setup_torch_training_env(True, False) + + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, + model: nn.Module = None, + output_path: str = None, + ) -> None: + self.args = args + self.config = config + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) + else: + self.tb_logger = tb_logger + self.output_path = output_path + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, "test_audios") + + self.keep_avg_train = None + self.keep_avg_eval = None + + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + + # model, audio processor, datasets, loss + # init audio processor + self.ap = AudioProcessor(**self.config.audio.to_dict()) + + # init character processor + self.model_characters = self.get_character_processor(self.config) + + # load dataset samples + self.data_train, self.data_eval = load_meta_data(self.config.datasets) + + # default speaker manager + self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.get_model( + len(self.model_characters), + self.speaker_manager.num_speakers, + self.config, + self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, + ) + + # setup criterion + self.criterion = self.get_criterion(self.config) + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed_backend, + self.config.distributed_url, + ) + + if self.use_cuda: + self.model.cuda() + self.criterion.cuda() + + # scalers for mixed precision training + self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None + + # setup optimizer + self.optimizer = self.get_optimizer(self.model, self.config) + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) + + # setup scheduler + self.scheduler = self.get_scheduler(self.config, self.optimizer) + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) + + # count model size + num_params = count_parameters(self.model) + print("\n > Model has {} parameters".format(num_params)) + + @staticmethod + def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, d_vector_dim) + return model + + @staticmethod + def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: + optimizer_name = config.optimizer + optimizer_params = config.optimizer_params + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), lr=config.lr, **optimizer_params) + + @staticmethod + def get_character_processor(config: Coqpit) -> str: + # setup custom characters if set in config file. 
+ # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters.to_dict()) + else: + from TTS.tts.utils.text.symbols import phonemes, symbols + model_characters = phonemes if config.use_phonemes else symbols + return model_characters + + @staticmethod + def get_speaker_manager( + config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None + ) -> SpeakerManager: + speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) + return speaker_manager + + @staticmethod + def get_scheduler( + config: Coqpit, optimizer: torch.optim.Optimizer + ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + from TTS.utils.training import NoamLR + + scheduler = NoamLR + else: + scheduler = getattr(torch.optim, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + @staticmethod + def get_criterion(config: Coqpit) -> nn.Module: + return setup_loss(config) + + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None, + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scaler" in checkpoint and config.mixed_precision: + print(" > Restoring AMP Scaler...") + scaler.load_state_dict(checkpoint["scaler"]) + except (KeyError, RuntimeError): + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + for group in optimizer.param_groups: + group["lr"] = self.config.lr + print( + " > Model restored from step %d" % checkpoint["step"], + ) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + def _get_loader( + self, + r: int, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + speaker_ids: Union[Dict, List], + d_vectors: Union[Dict, List], + ) -> DataLoader: + if is_eval and not self.config.run_eval: + loader = None + else: + dataset = TTSDataset( + outputs_per_step=r, + text_cleaner=self.config.text_cleaner, + compute_linear_spec=self.config.model.lower() == "tacotron", + meta_data=data_items, + ap=ap, + tp=self.config.characters, + add_blank=self.config["add_blank"], + batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, + min_seq_len=self.config.min_seq_len, + max_seq_len=self.config.max_seq_len, + phoneme_cache_path=self.config.phoneme_cache_path, + use_phonemes=self.config.use_phonemes, + phoneme_language=self.config.phoneme_language, + enable_eos_bos=self.config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, + d_vector_mapping=d_vectors + if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file + else None, + ) + + if self.config.use_phonemes and self.config.compute_input_seq_cache: + # precompute 
phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(self.config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def get_train_dataloader( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict + ) -> DataLoader: + return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) + + def get_eval_dataloder( + self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict + ) -> DataLoader: + return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) + + def format_batch(self, batch: List) -> Dict: + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # compute durations from attention masks + durations = None + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, : text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. 
+ stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + + # dispatch batch to GPU + if self.use_cuda: + text_input = to_cuda(text_input) + text_lengths = to_cuda(text_lengths) + mel_input = to_cuda(mel_input) + mel_lengths = to_cuda(mel_lengths) + linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None + stop_targets = to_cuda(stop_targets) + attn_mask = to_cuda(attn_mask) if attn_mask is not None else None + durations = to_cuda(durations) if attn_mask is not None else None + if speaker_ids is not None: + speaker_ids = to_cuda(speaker_ids) + if d_vectors is not None: + d_vectors = to_cuda(d_vectors) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "speaker_names": speaker_names, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "d_vectors": d_vectors, + "max_text_length": max_text_length, + "max_spec_length": max_spec_length, + "item_idx": item_idx, + } + + def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.train_step(batch, criterion) + return self.model.train_step(batch, criterion) + + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: + self.on_train_step_start() + step_start_time = time.time() + + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # zero-out optimizer + self.optimizer.zero_grad() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self._train_step(batch, self.criterion) + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") + + # optimizer step + if self.config.mixed_precision: + # model optimizer step in mixed precision mode + self.scaler.scale(loss_dict["loss"]).backward() + self.scaler.unscale_(self.optimizer) + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) + self.scaler.step(self.optimizer) + self.scaler.update() + else: + # main model optimizer step + loss_dict["loss"].backward() + grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) + self.optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if self.config.lr_scheduler: + self.scheduler.step() + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_train_values = dict() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + self.keep_avg_train.update_values(update_train_values) + + # print training progress + current_lr = self.optimizer.param_groups[0]["lr"] + if self.total_steps_done % self.config.print_step == 0: + log_dict = { + "max_spec_length": [batch["max_spec_length"], 1], # value, precision + "max_text_length": [batch["max_text_length"], 1], + "step_time": [step_time, 4], + "loader_time": [loader_time, 2], + "current_lr": 
current_lr, + } + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = { + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + } + iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) + + if self.total_steps_done % self.config.save_step == 0: + if self.config.checkpoint: + # save model + save_checkpoint( + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + model_loss=loss_dict["loss"], + characters=self.model_characters, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, + ) + # training visualizations + if hasattr(self.model, "module"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + else: + figures, audios = self.model.train_log(self.ap, batch, outputs) + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) + self.total_steps_done += 1 + self.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self) -> None: + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + loader_start_time = time.time() + for cur_step, batch in enumerate(self.train_loader): + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + + def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: + if hasattr(self.model, "module"): + return self.model.module.eval_step(batch, self.criterion) + return self.model.eval_step(batch, self.criterion) + + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + with torch.no_grad(): + step_start_time = time.time() + + with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): + outputs, loss_dict = self._eval_step(batch) + + step_time = time.time() - step_start_time + + # detach loss values + loss_dict_new = dict() + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_new[key] = value + else: + loss_dict_new[key] = value.item() + loss_dict = loss_dict_new + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_step_time"] = step_time + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self) -> None: + self.model.eval() + self.c_logger.print_eval_start() + loader_start_time = time.time() + batch = None + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = 
self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # Plot epoch stats and samples from the last batch. + if self.args.rank == 0: + if hasattr(self.model, "module"): + figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) + else: + figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) + + def test_run( + self, + ) -> None: + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_aux_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) + os.makedirs(file_path, exist_ok=True) + file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) + self.ap.save_wav(wav, file_path) + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) + + self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) + + def _get_aux_inputs(self) -> Dict: + # setup speaker_id + speaker_id = 0 if self.config.use_speaker_embedding else None + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) + if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding + else None + ) + # setup style_mel + if self.config.has("gst_style_input"): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and "use_gst" in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
+ style_wav = {} + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs + + def fit(self) -> None: + if self.restore_step != 0 or self.args.best_path: + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] + print(f" > Starting with loaded last best loss {self.best_loss}.") + + # define data loaders + self.train_loader = self.get_train_dataloader( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, + ) + self.eval_loader = ( + self.get_eval_dataloder( + self.config.r, + self.ap, + self.data_train, + verbose=True, + speaker_ids=self.speaker_manager.speaker_ids, + d_vectors=self.speaker_manager.d_vectors, + ) + if self.config.run_eval + else None + ) + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs and self.args.rank < 0: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) + self.save_best_model() + self.on_epoch_end() + + def save_best_model(self) -> None: + self.best_loss = save_best_model( + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.best_loss, + self.model, + self.optimizer, + self.total_steps_done, + self.epochs_done, + self.config.r, + self.output_path, + self.model_characters, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + scaler=self.scaler.state_dict() if self.config.mixed_precision else None, + ) + + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) + + def on_epoch_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_start"): + self.model.on_epoch_start(self) + + if hasattr(self.criterion, "on_epoch_start"): + self.criterion.on_epoch_start(self) + + if hasattr(self.optimizer, "on_epoch_start"): + self.optimizer.on_epoch_start(self) + + def on_epoch_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_epoch_end"): + self.model.on_epoch_end(self) + + if hasattr(self.criterion, "on_epoch_end"): + self.criterion.on_epoch_end(self) + + if hasattr(self.optimizer, "on_epoch_end"): + self.optimizer.on_epoch_end(self) + + def on_train_step_start(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_start"): + self.model.on_train_step_start(self) + + if hasattr(self.criterion, "on_train_step_start"): + self.criterion.on_train_step_start(self) + + if hasattr(self.optimizer, "on_train_step_start"): + self.optimizer.on_train_step_start(self) + + def on_train_step_end(self) -> None: # pylint: disable=no-self-use + if hasattr(self.model, "on_train_step_end"): + 
self.model.on_train_step_end(self)
+
+        if hasattr(self.criterion, "on_train_step_end"):
+            self.criterion.on_train_step_end(self)
+
+        if hasattr(self.optimizer, "on_train_step_end"):
+            self.optimizer.on_train_step_end(self)

From d4dbd897521b6a62de15a3b59ffba209631d5c8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Tue, 8 Jun 2021 17:34:19 +0200
Subject: [PATCH 190/258] fix calculation of `loader_start_time`

---
 TTS/tts/trainer_tts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py
index 9d060498..6c900120 100644
--- a/TTS/tts/trainer_tts.py
+++ b/TTS/tts/trainer_tts.py
@@ -476,8 +476,8 @@ class TrainerTTS(TrainerAbstract):
         else:
             batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size)
         self.c_logger.print_train_start()
-        loader_start_time = time.time()
         for cur_step, batch in enumerate(self.train_loader):
+            loader_start_time = time.time()
             _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
         epoch_time = time.time() - epoch_start_time
         # Plot self.epochs_done Stats

From 6d7b5fbcde3edd508141a696e2d1e75a8e77a514 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 7 Jun 2021 19:22:44 +0200
Subject: [PATCH 191/258] `tts` model abstraction with `TTSModel`

---
 TTS/tts/models/abstract_tts.py  | 134 ++++++++++++++++++++++++++++++++
 TTS/tts/models/align_tts.py     |   3 +-
 TTS/tts/models/glow_tts.py      |   3 +-
 TTS/tts/models/speedy_speech.py |   3 +-
 4 files changed, 140 insertions(+), 3 deletions(-)
 create mode 100644 TTS/tts/models/abstract_tts.py

diff --git a/TTS/tts/models/abstract_tts.py b/TTS/tts/models/abstract_tts.py
new file mode 100644
index 00000000..9132f7eb
--- /dev/null
+++ b/TTS/tts/models/abstract_tts.py
@@ -0,0 +1,134 @@
+from coqpit import Coqpit
+from abc import ABC, abstractmethod
+from typing import Dict, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+
+from TTS.utils.audio import AudioProcessor
+
+# pylint: skip-file
+
+
+class TTSModel(nn.Module, ABC):
+    """Abstract TTS class. Every new `tts` model must inherit this.
+
+    Notes on input/output tensor shapes:
+        Any input or output tensor of the model must be shaped as
+
+        - 3D tensors `batch x time x channels`
+        - 2D tensors `batch x channels`
+        - 1D tensors `batch x 1`
+    """
+
+    @abstractmethod
+    def forward(self, text: torch.Tensor, aux_input={}, **kwargs) -> Dict:
+        """Forward pass for the model mainly used in training.
+
+        You can be flexible here and use a different number of arguments and argument names since it is mostly used by
+        `train_step()` in training without exposing it outside of the class.
+
+        Args:
+            text (torch.Tensor): Input text character sequence ids.
+            aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs
+                for the model.
+
+        Returns:
+            Dict: model outputs. This must include an item keyed `model_outputs` as the final artifact of the model.
+        """
+        outputs_dict = {"model_outputs": None}
+        ...
+        return outputs_dict
+
+    @abstractmethod
+    def inference(self, text: torch.Tensor, aux_input={}) -> Dict:
+        """Forward pass for inference.
+
+        After the model is trained this is the only function that connects the model to the outside world.
+
+        This function must only take a `text` input and a dictionary that has all the other model specific inputs.
+        We don't use `*kwargs` since it is problematic with the TorchScript API.
+
+        Args:
+            text (torch.Tensor): [description]
+            aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
+
+        Returns:
+            Dict: [description]
+        """
+        outputs_dict = {"model_outputs": None}
+        ...
+        return outputs_dict
+
+    @abstractmethod
+    def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
+        """Perform a single training step. Run the model forward pass and compute losses.
+
+        Args:
+            batch (Dict): Input tensors.
+            criterion (nn.Module): Loss layer designed for the model.
+
+        Returns:
+            Tuple[Dict, Dict]: Model outputs and computed losses.
+        """
+        outputs_dict = {}
+        loss_dict = {}  # this returns from the criterion
+        ...
+        return outputs_dict, loss_dict
+
+    @abstractmethod
+    def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
+        """Create visualizations and waveform examples for training.
+
+        For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to
+        be projected onto Tensorboard.
+
+        Args:
+            ap (AudioProcessor): audio processor used at training.
+            batch (Dict): Model inputs used at the previous training step.
+            outputs (Dict): Model outputs generated at the previous training step.
+
+        Returns:
+            Tuple[Dict, np.ndarray]: training plots and output waveform.
+        """
+        figures_dict = {}
+        output_wav = np.array()
+        ...
+        return figures_dict, output_wav
+
+    @abstractmethod
+    def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
+        """Perform a single evaluation step. Run the model forward pass and compute losses. In most cases, you can
+        call `train_step()` with no changes.
+
+        Args:
+            batch (Dict): Input tensors.
+            criterion (nn.Module): Loss layer designed for the model.
+
+        Returns:
+            Tuple[Dict, Dict]: Model outputs and computed losses.
+        """
+        outputs_dict = {}
+        loss_dict = {}  # this returns from the criterion
+        ...
+        return outputs_dict, loss_dict
+
+    @abstractmethod
+    def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
+        """The same as `train_log()`"""
+        figures_dict = {}
+        output_wav = np.array()
+        ...
+        return figures_dict, output_wav
+
+    @abstractmethod
+    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None:
+        """Load a checkpoint and get ready for training or inference.
+
+        Args:
+            config (Coqpit): Model configuration.
+            checkpoint_path (str): Path to the model checkpoint file.
+            eval (bool, optional): If true, init model for inference else for training. Defaults to False.
+        """
+        ...
diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py
index 6c268a43..75fb50de 100644
--- a/TTS/tts/models/align_tts.py
+++ b/TTS/tts/models/align_tts.py
@@ -7,13 +7,14 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor
 from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
+from TTS.tts.models.abstract_tts import TTSModel
 from TTS.tts.utils.data import sequence_mask
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor


-class AlignTTS(nn.Module):
+class AlignTTS(TTSModel):
     """AlignTTS with modified duration predictor.
https://arxiv.org/pdf/2003.01950.pdf diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index e61b80c2..a30eadb4 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -7,13 +7,14 @@ from torch.nn import functional as F from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path +from TTS.tts.models.abstract_tts import TTSModel from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class GlowTTS(nn.Module): +class GlowTTS(TTSModel): """Glow TTS models from https://arxiv.org/abs/2005.11129 Args: diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index d4a90a2e..44a47722 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -6,13 +6,14 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path +from TTS.tts.models.abstract_tts import TTSModel from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class SpeedySpeech(nn.Module): +class SpeedySpeech(TTSModel): """Speedy Speech model https://arxiv.org/abs/2008.03802 From c7aad884cdd5b99f620390e0c3af58cdbd710418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:22:05 +0200 Subject: [PATCH 192/258] Implement unified trainer --- TTS/bin/train_encoder.py | 2 +- TTS/bin/train_tts.py | 24 +- TTS/bin/train_vocoder.py | 27 + TTS/bin/train_vocoder_gan.py | 638 ------------------ TTS/bin/train_vocoder_wavegrad.py | 431 ------------ TTS/bin/train_vocoder_wavernn.py | 431 ------------ TTS/trainer.py | 999 ++++++++++++++++++++++++++-- TTS/tts/models/tacotron_abstract.py | 245 ------- TTS/tts/trainer_tts.py | 709 -------------------- TTS/utils/arguments.py | 182 ----- TTS/utils/callbacks.py | 75 +++ TTS/utils/distribute.py | 45 -- TTS/utils/trainer_utils.py | 65 ++ TTS/utils/training.py | 79 +-- 14 files changed, 1128 insertions(+), 2824 deletions(-) create mode 100644 TTS/bin/train_vocoder.py delete mode 100755 TTS/bin/train_vocoder_gan.py delete mode 100644 TTS/bin/train_vocoder_wavegrad.py delete mode 100644 TTS/bin/train_vocoder_wavernn.py delete mode 100644 TTS/tts/models/tacotron_abstract.py delete mode 100644 TTS/tts/trainer_tts.py delete mode 100644 TTS/utils/arguments.py create mode 100644 TTS/utils/callbacks.py create mode 100644 TTS/utils/trainer_utils.py diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 6e4a9b32..38902a18 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,8 +13,8 @@ from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings +from TTS.trainer import init_training from TTS.tts.datasets import load_meta_data -from TTS.utils.arguments import init_training from TTS.utils.audio import 
AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict from TTS.utils.radam import RAdam diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 06765906..c491700d 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -1,27 +1,13 @@ -import os import sys -import traceback -from TTS.tts.trainer_tts import TrainerTTS -from TTS.utils.arguments import init_training -from TTS.utils.generic_utils import remove_experiment_folder +from TTS.trainer import Trainer, init_training def main(): - try: - args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) - trainer = TrainerTTS(args, config, c_logger, tb_logger, output_path=output_path) - trainer.fit() - except KeyboardInterrupt: - remove_experiment_folder(output_path) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(output_path) - traceback.print_exc() - sys.exit(1) + """Run 🐸TTS trainer from terminal. This is also necessary to run DDP training by ```distribute.py```""" + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=False) + trainer.fit() if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py new file mode 100644 index 00000000..868aae2e --- /dev/null +++ b/TTS/bin/train_vocoder.py @@ -0,0 +1,27 @@ +import os +import sys +import traceback + +from TTS.trainer import Trainer, init_training +from TTS.utils.generic_utils import remove_experiment_folder + + +def main(): + try: + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger) + trainer.fit() + except KeyboardInterrupt: + remove_experiment_folder(output_path) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(output_path) + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py deleted file mode 100755 index ea317ef6..00000000 --- a/TTS/bin/train_vocoder_gan.py +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env python3 -# TODO: mixed precision training -"""Trains GAN based vocoder model.""" - -import itertools -import os -import sys -import time -import traceback -from inspect import signature - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.gan_dataset import GANDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import plot_results, setup_discriminator, setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, 
is_val=False, verbose=False): - loader = None - if not is_val or c.run_eval: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - return_pairs=c.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in c else False, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) - dataset.shuffle_mapping() - sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=num_gpus == 0, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - if isinstance(data[0], list): - x_G, y_G = data[0] - x_D, y_D = data[1] - if use_cuda: - x_G = x_G.cuda(non_blocking=True) - y_G = y_G.cuda(non_blocking=True) - x_D = x_D.cuda(non_blocking=True) - y_D = y_D.cuda(non_blocking=True) - return x_G, y_G, x_D, y_D - x, y = data - if use_cuda: - x = x.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) - return x, y, None, None - - -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model_G.train() - model_D.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, c_D, y_D = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G) - y_hat_sub = None - y_G_sub = None - y_hat_vis = y_hat # for visualization - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_hat_vis = y_hat - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - # we don't need scores for real samples for training G since they are always 1 - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - - # compute losses - loss_G_dict = criterion_G( - y_hat=y_hat, - y=y_G, - scores_fake=scores_fake, - feats_fake=feats_fake, - feats_real=feats_real, - y_hat_sub=y_hat_sub, - y_sub=y_G_sub, - ) - loss_G = loss_G_dict["G_loss"] - - # optimizer generator - optimizer_G.zero_grad() - loss_G.backward() - if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) - optimizer_G.step() - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - if c.diff_samples_for_G_and_D: - # use a different sample than generator - with torch.no_grad(): - y_hat = model_G(c_D) - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - else: - # use the same samples as generator - c_D = c_G.clone() - y_D = y_G.clone() - - # run D with or without cond. features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach().clone(), c_D) - D_out_real = model_D(y_D, c_D) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_D) - - # format D outputs - if isinstance(D_out_fake, tuple): - # model_D returns scores and features - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - # model D returns only scores - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] - - # optimizer discriminator - optimizer_D.zero_grad() - loss_D.backward() - if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) - optimizer_D.step() - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr_G": current_lr_G, 
"lr_D": current_lr_D, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) - - # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]) - end_time = time.time() - - if scheduler_G is not None: - scheduler_G.step() - - if scheduler_D is not None: - scheduler_D.step() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - torch.cuda.empty_cache() - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model_G.eval() - model_D.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, _, _ = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - y_hat_sub = None - y_G_sub = None - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - feats_fake, feats_real = None, None - - # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub) - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - with torch.no_grad(): - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - - # run D with or without cond. 
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach(), c_G) - D_out_real = model_D(y_G, c_G) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - predict_waveform = y_hat[0].squeeze(0).detach().cpu().numpy() - real_waveform = y_G[0].squeeze(0).cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": predict_waveform, "eval/real_waveformo": real_waveform}, c.audio["sample_rate"] - ) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - - # synthesize a full voice - data_loader.return_segments = False - torch.cuda.empty_cache() - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model_gen = setup_generator(c) - model_disc = setup_discriminator(c) - - # setup criterion - criterion_gen = GeneratorLoss(c) - criterion_disc = DiscriminatorLoss(c) - - if use_cuda: - model_gen.cuda() - criterion_gen.cuda() - model_disc.cuda() - criterion_disc.cuda() - - # setup optimizers - # TODO: allow loading custom optimizers - optimizer_gen = None - optimizer_disc = None - optimizer_gen = getattr(torch.optim, c.optimizer) - optimizer_gen = optimizer_gen(model_gen.parameters(), lr=c.lr_gen, **c.optimizer_params) - optimizer_disc = getattr(torch.optim, c.optimizer) - - if c.discriminator_model == "hifigan_discriminator": - optimizer_disc = optimizer_disc( - itertools.chain(model_disc.msd.parameters(), model_disc.mpd.parameters()), - lr=c.lr_disc, - **c.optimizer_params, - ) - else: - optimizer_disc = optimizer_disc(model_disc.parameters(), lr=c.lr_disc, **c.optimizer_params) - - # schedulers - scheduler_gen = None - scheduler_disc = None - if "lr_scheduler_gen" in c: - scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = 
scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: - scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) - print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) - print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint["model_disc"]) - print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - # restore schedulers if it is a continuing training. - if args.continue_path != "": - if "scheduler" in checkpoint and scheduler_gen is not None: - print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint and scheduler_disc is not None: - print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) - scheduler_disc.optimizer = optimizer_disc - if c.lr_scheduler_disc == "ExponentialLR": - scheduler_disc.last_epoch = checkpoint["epoch"] - except RuntimeError: - # restore only matching layers. - print(" > Partial model initialization...") - model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_gen.load_state_dict(model_dict) - - model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) - model_disc.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- if args.continue_path == "": - for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen - - for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc - - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model_gen = DDP_th(model_gen, device_ids=[args.rank]) - model_disc = DDP_th(model_disc, device_ids=[args.rank]) - - num_params = count_parameters(model_gen) - print(" > Generator has {} parameters".format(num_params), flush=True) - num_params = count_parameters(model_disc) - print(" > Discriminator has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with best loss of {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py deleted file mode 100644 index c8f067ee..00000000 --- a/TTS/bin/train_vocoder_wavegrad.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Trains WaveGrad vocoder models.""" - -import os -import sys -import time -import traceback - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.optim import Adam -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset -from TTS.vocoder.utils.generic_utils import plot_results, setup_generator -from TTS.vocoder.utils.io import save_best_model, 
save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveGradDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose, - ) - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.batch_size, - shuffle=num_gpus <= 1, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - - return loader - - -def format_data(data): - # return a whole audio segment - m, x = data - x = x.unsqueeze(1) - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def format_test_data(data): - # return a whole audio segment - m, x = data - m = m[None, ...] - x = x[None, None, ...] - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # setup noise schedule - noise_schedule = c["train_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - else: - model.compute_noise_level(betas) - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - with torch.cuda.amp.autocast(enabled=c.mixed_precision): - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - # check nan loss - if torch.isnan(loss).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - optimizer.zero_grad() - - # backward pass with loss scaling - if c.mixed_precision: - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - grad_norm = torch.nn.utils.grad_clip_norm_(model.parameters(), c.clip_grad) - optimizer.step() - - # schedule update - if scheduler is not None: - scheduler.step() - - # disconnect loss values - loss_dict = dict() - for key, value in loss_wavegrad_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - # epoch/step timing - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr = list(optimizer.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): 
- update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": current_lr, - "grad_norm": grad_norm.item(), - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm.item(), "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - if c.tb_model_param_stats and args.rank == 0: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - loss_dict = dict() - for key, value in loss_wavegrad_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - data_loader.dataset.return_segments = False - samples = data_loader.dataset.load_test_samples(1) - m, x = format_test_data(samples[0]) - - # setup noise schedule and inference - noise_schedule = c["test_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - # compute voice - x_pred = model.module.inference(m) - else: - model.compute_noise_level(betas) - # compute voice - x_pred = model.inference(m) - - # 
compute spectrograms - figures = plot_results(x_pred, x, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - sample_voice = x_pred[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - data_loader.dataset.return_segments = True - - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model = setup_generator(c) - - # scaler for mixed_precision - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # setup optimizers - optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0) - - # schedulers - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - - # setup criterion - criterion = torch.nn.L1Loss().cuda() - - if use_cuda: - model.cuda() - criterion.cuda() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler.optimizer = optimizer - if "scaler" in checkpoint and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - num_params = count_parameters(model) - print(" > WaveGrad has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py deleted file mode 100644 index 86a1506a..00000000 --- a/TTS/bin/train_vocoder_wavernn.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Train WaveRNN vocoder model.""" - -import os -import random -import sys -import time -import traceback - -import torch -from torch.utils.data import DataLoader - -from TTS.tts.utils.visual import plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -# from torch.utils.data.distributed import DistributedSampler - - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad=c.padding, - mode=c.mode, - mulaw=c.mulaw, - is_training=not is_val, - verbose=verbose, - 
) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=c.batch_size, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=True, - ) - return loader - - -def format_data(data): - # setup input data - x_input = data[0] - mels = data[1] - y_coarse = data[2] - - # dispatch data to GPU - if use_cuda: - x_input = x_input.cuda(non_blocking=True) - mels = mels.cuda(non_blocking=True) - y_coarse = y_coarse.cuda(non_blocking=True) - - return x_input, mels, y_coarse - - -def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # train loop - for num_iter, data in enumerate(data_loader): - start_time = time.time() - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - optimizer.zero_grad() - - if c.mixed_precision: - # mixed precision training - with torch.cuda.amp.autocast(): - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - # full precision training - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - if loss.item() is None: - raise RuntimeError(" [!] None loss. 
Exiting ...") - loss.backward() - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - optimizer.step() - - if scheduler is not None: - scheduler.step() - - # get the current learning rate - cur_lr = list(optimizer.param_groups)[0]["lr"] - - step_time = time.time() - start_time - epoch_time += step_time - - update_train_values = dict() - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) - - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": cur_lr, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_step_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - # synthesize a full voice - rand_idx = random.randrange(0, len(train_data)) - wav_path = ( - train_data[rand_idx] if not isinstance(train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] - ) - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - tb_logger.tb_train_audios(global_step, {"train/audio": sample_wav}, c.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - with torch.no_grad(): - for num_iter, data in enumerate(data_loader): - start_time = time.time() - # format data - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - loss = criterion(y_hat, y_coarse) - # Compute avg loss - # if num_gpus > 1: - # loss = 
reduce_tensor(loss.data, num_gpus) - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if epoch % c.test_every_epochs == 0 and epoch != 0: - # synthesize a full voice - rand_idx = random.randrange(0, len(eval_data)) - wav_path = eval_data[rand_idx] if not isinstance(eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # Sample audio - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_wav}, c.audio["sample_rate"]) - - # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_eval_figures(global_step, figures) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - # setup model - model_wavernn = setup_generator(c) - - # setup amp scaler - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # define train functions - if c.mode == "mold": - criterion = discretized_mix_logistic_loss - elif c.mode == "gauss": - criterion = gaussian_loss - elif isinstance(c.mode, int): - criterion = torch.nn.CrossEntropyLoss() - - if use_cuda: - model_wavernn.cuda() - if isinstance(c.mode, int): - criterion.cuda() - - optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) - - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) - # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) - - # restore any checkpoint - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model_wavernn.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring Generator LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - scheduler.optimizer = optimizer - if "scaler" in checkpoint 
and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_wavernn.load_state_dict(model_dict) - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRIBUTED - # if num_gpus > 1: - # model = apply_gradient_allreduce(model) - - num_parameters = count_parameters(model_wavernn) - print(" > Model has {} parameters".format(num_parameters), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_wavernn, optimizer, criterion, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict["avg_model_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model_wavernn, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/trainer.py b/TTS/trainer.py index 5c02fdfb..8b7be3d1 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1,22 +1,52 @@ # -*- coding: utf-8 -*- +import glob import importlib -from abc import ABC, abstractmethod +import logging +import os +import re +import sys +import time +import traceback +from argparse import Namespace from dataclasses import dataclass, field -from typing import Dict, List, Tuple, TypeVar +from typing import Dict, List, Tuple, Union import torch from coqpit import Coqpit - -# DISTRIBUTED from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader -_DataLoader = TypeVar("_DataLoader") +from TTS.config import load_config +from TTS.tts.datasets import load_meta_data +from TTS.tts.models import setup_model as setup_tts_model +from TTS.tts.utils.text.symbols import parse_symbols +from TTS.utils.audio import AudioProcessor +from TTS.utils.callbacks import TrainerCallback +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, + to_cuda, +) +from 
TTS.utils.io import copy_model_files, save_best_model, save_checkpoint +from TTS.utils.logging import ConsoleLogger, TensorboardLogger +from TTS.utils.trainer_utils import * +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.models import setup_model as setup_vocoder_model + +if is_apex_available(): + from apex import amp @dataclass class TrainingArgs(Coqpit): - """Trainer arguments that are parsed externally (e.g. CLI)""" + """Trainer arguments""" continue_path: str = field( default="", @@ -41,101 +71,926 @@ class TrainingArgs(Coqpit): group_id: str = field(default="", metadata={"help": "Process group id in distributed training."}) -# pylint: disable=import-outside-toplevel, too-many-public-methods +class Trainer: + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + output_path: str, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, + model: nn.Module = None, + cudnn_benchmark: bool = False, + ) -> None: + """Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models + or easily be customized. + Notes: -class TrainerAbstract(ABC): + Supports Automatic Mixed Precision training. If `Apex` is availabe, it automatically picks that, else + it uses PyTorch's native `amp` module. `Apex` may provide more stable training in some cases. + + Args: + + args (Union[Coqpit, Namespace]): Training arguments parsed either from console by `argparse` or `TrainingArgs` + config object. + + config (Coqpit): Model config object. It includes all the values necessary for initializing, training, evaluating + and testing the model. + + output_path (str): Path to the output training folder. All the files are saved under thi path. + + c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default + console logger is used. Defaults to None. + + tb_logger (TensorboardLogger, optional): Tensorboard logger. If not provided, the default logger is used. + Defaults to None. + + model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer` + initializes a model from the provided config. Defaults to None. + + cudnn_benchmark (bool): enable/disable PyTorch cudnn benchmarking. It is better to disable if the model input + length is changing batch to batch along the training. + + Examples: + + Running trainer on a model. + + >>> args = TrainingArgs(...) + >>> config = HifiganConfig(...) + >>> model = GANModel(config) + >>> trainer = Trainer(args, config, output_path, model=model) + >>> trainer.fit() + + Running trainer on a config. + + >>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,) + >>> args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) + >>> trainer = Trainer(args, config, output_path, c_logger, tb_logger) + >>> trainer.fit() + + TODO: + - Accumulate gradients b/w batches. + - Deepspeed integration + - Profiler integration. + - Overfitting to a batch. 
+ - TPU training + """ + + # set and initialize Pytorch runtime + self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark) + + if config is None: + # parse config from console arguments + config, output_path, _, c_logger, tb_logger = process_args(args) + + self.output_path = output_path + self.args = args + self.config = config + + # init loggers + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + else: + self.tb_logger = tb_logger + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, "test_audios") + + self.keep_avg_train = None + self.keep_avg_eval = None + + self.use_apex = self._is_apex_available() + self.use_amp_scaler = self.config.mixed_precision and self.use_cuda + + # init audio processor + self.ap = AudioProcessor(**self.config.audio.to_dict()) + + # load dataset samples + # TODO: refactor this + if "datasets" in self.config: + # load data for `tts` models + self.data_train, self.data_eval = load_meta_data(self.config.datasets) + elif self.config.feature_path is not None: + # load data for `vocoder`models + print(f" > Loading features from: {self.config.feature_path}") + self.data_eval, self.data_train = load_wav_feat_data( + self.config.data_path, self.config.feature_path, self.config.eval_split_size + ) + else: + # load data for `vocoder`models + self.data_eval, self.data_train = load_wav_data(self.config.data_path, self.config.eval_split_size) + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.get_model(self.config) + + # setup criterion + self.criterion = self.get_criterion(self.model) + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed_backend, + self.config.distributed_url, + ) + + if self.use_cuda: + self.model.cuda() + if isinstance(self.criterion, list): + self.criterion = [x.cuda() for x in self.criterion] + else: + self.criterion.cuda() + + # setup optimizer + self.optimizer = self.get_optimizer(self.model, self.config) + + # callback + self.callbacks = TrainerCallback(self) + self.callbacks.on_init_start() + + # init AMP + if self.use_amp_scaler: + if self.use_apex: + self.scaler = None + self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1") + if isinstance(self.optimizer, list): + self.scaler = [torch.cuda.amp.GradScaler()] * len(self.optimizer) + else: + self.scaler = torch.cuda.amp.GradScaler() + else: + self.scaler = None + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) + + # setup scheduler + self.scheduler = self.get_scheduler(self.model, self.config, self.optimizer) + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) + + # count model size + num_params = count_parameters(self.model) + print("\n > Model has {} parameters".format(num_params)) + + self.callbacks.on_init_end() + + @staticmethod + def get_model(config: Coqpit) -> nn.Module: + """Initialize model from config. + + Args: + config (Coqpit): Model config. + + Returns: + nn.Module: initialized model. + """ + # TODO: better model setup + try: + model = setup_tts_model(config) + except ModuleNotFoundError: + model = setup_vocoder_model(config) + return model + + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None, + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + """Restore training from an old run. 
It restores model, optimizer, AMP scaler and training stats. + + Args: + config (Coqpit): Model config. + restore_path (str): Path to the restored training run. + model (nn.Module): Model to restored. + optimizer (torch.optim.Optimizer): Optimizer to restore. + scaler (torch.cuda.amp.GradScaler, optional): AMP scaler to restore. Defaults to None. + + Returns: + Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: [description] + """ + + def _restore_list_objs(states, obj): + if isinstance(obj, list): + for idx, state in enumerate(states): + obj[idx].load_state_dict(state) + else: + obj.load_state_dict(states) + return obj + + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer = _restore_list_objs(checkpoint["optimizer"], optimizer) + if "scaler" in checkpoint and self.use_amp_scaler: + print(" > Restoring AMP Scaler...") + scaler = _restore_list_objs(checkpoint["scaler"], scaler) + except (KeyError, RuntimeError): + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + if isinstance(self.optimizer, list): + for idx, optim in enumerate(optimizer): + for group in optim.param_groups: + group["lr"] = self.get_lr(model, config)[idx] + else: + for group in optimizer.param_groups: + group["lr"] = self.get_lr(model, config) + print( + " > Model restored from step %d" % checkpoint["step"], + ) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + @staticmethod + def _get_loader( + model: nn.Module, + config: Coqpit, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + num_gpus: int, + ) -> DataLoader: + if hasattr(model, "get_data_loader"): + loader = model.get_data_loader(config, ap, is_eval, data_items, verbose, num_gpus) + return loader + + def get_train_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + """Initialize and return a training data loader. + + Args: + ap (AudioProcessor): Audio processor. + data_items (List): Data samples used for training. + verbose (bool): enable/disable printing loader stats at initialization. + + Returns: + DataLoader: Initialized training data loader. + """ + return self._get_loader(self.model, self.config, ap, False, data_items, verbose, self.num_gpus) + + def get_eval_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + return self._get_loader(self.model, self.config, ap, True, data_items, verbose, self.num_gpus) + + def format_batch(self, batch: List) -> Dict: + """Format dataloader ouput and return a batch. + + Args: + batch (List): Batch returned by the dataloader. + + Returns: + Dict: Formatted batch. + """ + batch = self.model.format_batch(batch) + if self.use_cuda: + for k, v in batch.items(): + batch[k] = to_cuda(v) + return batch + + @staticmethod + def _model_train_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a trainig forward step. Compute model outputs and losses. + + Args: + batch (Dict): [description] + model (nn.Module): [description] + criterion (nn.Module): [description] + optimizer_idx (int, optional): [description]. Defaults to None. 
+ + Returns: + Tuple[Dict, Dict]: [description] + """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + # unwrap model in DDP training + if hasattr(model, "module"): + return model.module.train_step(*input_args) + return model.train_step(*input_args) + + def _optimize( + self, + batch: Dict, + model: nn.Module, + optimizer: Union[torch.optim.Optimizer, List], + scaler: "AMPScaler", + criterion: nn.Module, + scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List], # pylint: disable=protected-access + config: Coqpit, + optimizer_idx: int = None, + ) -> Tuple[Dict, Dict, int, torch.Tensor]: + """Perform a forward - backward pass and run the optimizer. + + Args: + batch (Dict): Input batch. If + model (nn.Module): Model for training. Defaults to None. + optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use. + scaler (AMPScaler): AMP scaler. + criterion (nn.Module): Model's criterion. + scheduler (Union[torch.optim.lr_scheduler._LRScheduler, List]): LR scheduler used by the optimizer. + config (Coqpit): Model config. + optimizer_idx (int, optional): Target optimizer being used. Defaults to None. + + Raises: + RuntimeError: When the loss is NaN. + + Returns: + Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm. + """ + step_start_time = time.time() + # zero-out optimizer + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=config.mixed_precision): + if optimizer_idx is not None: + outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx) + else: + outputs, loss_dict = self._model_train_step(batch, model, criterion) + + # skip the rest + if outputs is None: + step_time = time.time() - step_start_time + return None, {}, step_time, 0 + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") + + # set gradient clipping threshold + if "grad_clip" in config and config.grad_clip is not None: + if optimizer_idx is not None: + grad_clip = config.grad_clip[optimizer_idx] + else: + grad_clip = config.grad_clip + else: + grad_clip = 0.0 # meaning no gradient clipping + + # TODO: compute grad norm + if grad_clip <= 0: + grad_norm = 0 + + # optimizer step + update_lr_scheduler = True + if self.use_amp_scaler: + if self.use_apex: + with amp.scale_loss(loss_dict["loss"], self.optimizer) as scaled_loss: + scaled_loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + self.config.grad_clip, + ) + else: + # model optimizer step in mixed precision mode + scaler.scale(loss_dict["loss"]).backward() + scaler.unscale_(optimizer) + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + scale_prev = scaler.get_scale() + scaler.step(optimizer) + scaler.update() + update_lr_scheduler = scale_prev <= scaler.get_scale() + else: + # main model optimizer step + loss_dict["loss"].backward() + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if scheduler is not None and update_lr_scheduler: + scheduler.step() + + # detach losses + loss_dict = self._detach_loss_dict(loss_dict) + if optimizer_idx is not None: + loss_dict[f"loss_{optimizer_idx}"] = loss_dict.pop("loss") + 
loss_dict[f"grad_norm_{optimizer_idx}"] = grad_norm + return outputs, loss_dict, step_time, grad_norm + + @staticmethod + def _detach_loss_dict(loss_dict: Dict) -> Dict: + """Detach loss values from autograp. + + Args: + loss_dict (Dict): losses. + + Returns: + Dict: losses detached from autograph. + """ + loss_dict_detached = {} + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_detached[key] = value + else: + loss_dict_detached[key] = value.item() + return loss_dict_detached + + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: + """Perform a training step on a batch of inputs and log the process. + + Args: + batch (Dict): Input batch. + batch_n_steps (int): Number of steps needed to complete an epoch. Needed for logging. + step (int): Current step number in this epoch. + loader_start_time (float): The time when the data loading is started. Needed for logging. + + Returns: + Tuple[Dict, Dict]: Model outputs and losses. + """ + self.callbacks.on_train_step_start() + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # conteainers to hold model outputs and losses for each optimizer. + outputs_per_optimizer = None + log_dict = {} + loss_dict = {} + if not isinstance(self.optimizer, list): + # training with a single optimizer + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, self.optimizer, self.scaler, self.criterion, self.scheduler, self.config + ) + loss_dict.update(loss_dict_new) + else: + # training with multiple optimizers (e.g. GAN) + outputs_per_optimizer = [None] * len(self.optimizer) + total_step_time = 0 + for idx, optimizer in enumerate(self.optimizer): + criterion = self.criterion + scaler = self.scaler[idx] if self.use_amp_scaler else None + scheduler = self.scheduler[idx] + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, optimizer, scaler, criterion, scheduler, self.config, idx + ) + # skip the rest if the model returns None + total_step_time += step_time + outputs_per_optimizer[idx] = outputs + # if None, model skipped this optimizer + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + keep_avg_update = dict() + for key, value in log_dict.items(): + keep_avg_update["avg_" + key] = value + keep_avg_update["avg_loader_time"] = loader_time + keep_avg_update["avg_step_time"] = step_time + self.keep_avg_train.update_values(keep_avg_update) + + # print training progress + if self.total_steps_done % self.config.print_step == 0: + # log learning rates + lrs = {} + if isinstance(self.optimizer, list): + for idx, optimizer in enumerate(self.optimizer): + current_lr = self.optimizer[idx].param_groups[0]["lr"] + lrs.update({f"current_lr_{idx}": current_lr}) + else: + current_lr = self.optimizer.param_groups[0]["lr"] + lrs = {"current_lr": current_lr} + log_dict.update(lrs) + if grad_norm > 0: + log_dict.update({"grad_norm": grad_norm}) + # log run-time stats + log_dict.update( + { + "step_time": round(step_time, 4), + "loader_time": round(loader_time, 4), + } + ) + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load and don't log every step + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = log_dict + 
iter_stats.update(loss_dict) + self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) + if self.total_steps_done % self.config.save_step == 0 and self.total_steps_done != 0: + if self.config.checkpoint: + # checkpoint the model + model_loss = ( + loss_dict[self.config.target_loss] if "target_loss" in self.config else loss_dict["loss"] + ) + save_checkpoint( + self.config, + self.model, + self.optimizer, + self.scaler if self.use_amp_scaler else None, + self.total_steps_done, + self.epochs_done, + self.output_path, + model_loss=model_loss, + ) + # training visualizations + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + elif hasattr(self.model, "train_log"): + figures, audios = self.model.train_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_train_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.total_steps_done += 1 + self.callbacks.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self) -> None: + """Main entry point for training. Run training on the whole training samples.""" + self.train_loader = self.get_train_dataloader( + self.ap, + self.data_train, + verbose=True, + ) + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + for cur_step, batch in enumerate(self.train_loader): + loader_start_time = time.time() + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + + @staticmethod + def _model_eval_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a evaluation forward pass. Compute model outputs and losses with no gradients. + + Args: + batch (Dict): IBatch of inputs. + model (nn.Module): Model to call evaluation. + criterion (nn.Module): Model criterion. + optimizer_idx (int, optional): Optimizer ID to define the closure in multi-optimizer training. Defaults to None. + + Returns: + Tuple[Dict, Dict]: model outputs and losses. 
+ """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + if hasattr(model, "module"): + return model.module.eval_step(*input_args) + return model.eval_step(*input_args) + + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + with torch.no_grad(): + outputs_per_optimizer = None + loss_dict = {} + if not isinstance(self.optimizer, list): + outputs, loss_dict = self._model_eval_step(batch, self.model, self.criterion) + else: + outputs_per_optimizer = [None] * len(self.optimizer) + for idx, _ in enumerate(self.optimizer): + criterion = self.criterion + outputs, loss_dict_new = self._model_eval_step(batch, self.model, criterion, idx) + outputs_per_optimizer[idx] = outputs + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self) -> None: + self.eval_loader = ( + self.get_eval_dataloader( + self.ap, + self.data_eval, + verbose=True, + ) + if self.config.run_eval + else None + ) + + self.model.eval() + self.c_logger.print_eval_start() + loader_start_time = time.time() + batch = None + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # plot epoch stats, artifacts and figures + if self.args.rank == 0: + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "eval_log"): + figures, audios = self.model.module.eval_log(self.ap, batch, outputs) + elif hasattr(self.model, "eval_log"): + figures, audios = self.model.eval_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) + + def test_run(self) -> None: + """Run test and log the results. Test run must be defined by the model. 
+ Model must return figures and audios to be logged by the Tensorboard logger.""" + if hasattr(self.model, "test_run"): + if hasattr(self.eval_loader.load_test_samples): + samples = self.eval_loader.load_test_samples(1) + figures, audios = self.model.test_run(samples) + else: + figures, audios = self.model.test_run() + self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"]) + self.tb_logger.tb_test_figures(self.total_steps_done, figures) + + def _fit(self) -> None: + """🏃 train -> evaluate -> test for the number of epochs.""" + if self.restore_step != 0 or self.args.best_path: + print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") + self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] + print(f" > Starting with loaded last best loss {self.best_loss}.") + + self.total_steps_done = self.restore_step + + for epoch in range(0, self.config.epochs): + self.callbacks.on_epoch_start() + self.keep_avg_train = KeepAverage() + self.keep_avg_eval = KeepAverage() if self.config.run_eval else None + self.epochs_done = epoch + self.c_logger.print_epoch_start(epoch, self.config.epochs) + self.train_epoch() + if self.config.run_eval: + self.eval_epoch() + if epoch >= self.config.test_delay_epochs and self.args.rank < 0: + self.test_run() + self.c_logger.print_epoch_end( + epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values + ) + self.save_best_model() + self.callbacks.on_epoch_end() + + def fit(self) -> None: + """Where the ✨️magic✨️ happens...""" + try: + self._fit() + except KeyboardInterrupt: + self.callbacks.on_keyboard_interrupt() + # if the output folder is empty remove the run. + remove_experiment_folder(self.output_path) + # stop without error signal + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except BaseException: # pylint: disable=broad-except + remove_experiment_folder(self.output_path) + traceback.print_exc() + sys.exit(1) + + def save_best_model(self) -> None: + """Save the best model. 
It only saves if the current target loss is smaller then the previous.""" + self.best_loss = save_best_model( + self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], + self.best_loss, + self.config, + self.model, + self.optimizer, + self.scaler if self.use_amp_scaler else None, + self.total_steps_done, + self.epochs_done, + self.output_path, + keep_all_best=self.config.keep_all_best, + keep_after=self.config.keep_after, + ) + + @staticmethod + def _setup_logger_config(log_file: str) -> None: + logging.basicConfig( + level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] + ) @staticmethod def _is_apex_available(): return importlib.util.find_spec("apex") is not None @staticmethod - @abstractmethod - def get_model(*args, **kwargs) -> nn.Module: - pass + def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]: + if hasattr(model, "get_optimizer"): + optimizer = model.get_optimizer() + if optimizer is None: + optimizer_name = config.optimizer + optimizer_params = config.optimizer_params + return get_optimizer(optimizer_name, optimizer_params, config.lr, model) + return optimizer @staticmethod - @abstractmethod - def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - pass + def get_lr(model: nn.Module, config: Coqpit) -> Union[float, List[float]]: + lr = None + if hasattr(model, "get_lr"): + lr = model.get_lr() + if lr is None: + lr = config.lr + return lr @staticmethod - @abstractmethod def get_scheduler( - config: Coqpit, optimizer: torch.optim.Optimizer - ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - pass + model: nn.Module, config: Coqpit, optimizer: Union[torch.optim.Optimizer, List] + ) -> Union[torch.optim.lr_scheduler._LRScheduler, List]: # pylint: disable=protected-access + scheduler = None + if hasattr(model, "get_scheduler"): + scheduler = model.get_scheduler(optimizer) + if scheduler is None: + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + return get_scheduler(lr_scheduler, lr_scheduler_params, optimizer) + return scheduler @staticmethod - @abstractmethod - def get_criterion(config: Coqpit) -> nn.Module: - pass + def get_criterion(model: nn.Module) -> nn.Module: + criterion = None + criterion = model.get_criterion() + return criterion - @abstractmethod - def restore_model(self, *args, **kwargs) -> Tuple: - pass - @abstractmethod - def get_train_dataloader(self, *args, **kwargs) -> _DataLoader: - pass +def init_arguments(): + train_config = TrainingArgs() + parser = train_config.init_argparse(arg_prefix="") + return parser - @abstractmethod - def get_eval_dataloder(self, *args, **kwargs) -> _DataLoader: - pass - @abstractmethod - def format_batch(self, batch: List) -> Dict: - pass +def get_last_checkpoint(path): + """Get latest checkpoint or/and best model in path. - @abstractmethod - def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - pass + It is based on globbing for `*.pth.tar` and the RegEx + `(checkpoint|best_model)_([0-9]+)`. - @abstractmethod - def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - pass + Args: + path (list): Path to files to be compared. - @abstractmethod - def train_epoch(self) -> None: - pass + Raises: + ValueError: If no checkpoint or best_model files are found. 
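The resolver statics above prefer model-defined hooks and fall back to the config values only when a hook is missing or returns None. A sketch of a model exposing these hooks; the optimizer, learning rate, scheduler and loss choices are illustrative, not prescribed by the patch:

import torch
from torch import nn


class DemoModel(nn.Module):
    """Toy model exposing the hooks Trainer.get_optimizer/get_lr/get_scheduler/get_criterion look for."""

    def __init__(self):
        super().__init__()
        self.net = nn.Linear(80, 80)

    def forward(self, x):
        return self.net(x)

    def get_lr(self):
        return 1e-3

    def get_optimizer(self):
        return torch.optim.AdamW(self.parameters(), lr=self.get_lr())

    def get_scheduler(self, optimizer):
        return torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.5)

    def get_criterion(self):
        return nn.L1Loss()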
- @abstractmethod - def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - pass + Returns: + last_checkpoint (str): Last checkpoint filename. + """ + file_names = glob.glob(os.path.join(path, "*.pth.tar")) + last_models = {} + last_model_nums = {} + for key in ["checkpoint", "best_model"]: + last_model_num = None + last_model = None + # pass all the checkpoint files and find + # the one with the largest model number suffix. + for file_name in file_names: + match = re.search(f"{key}_([0-9]+)", file_name) + if match is not None: + model_num = int(match.groups()[0]) + if last_model_num is None or model_num > last_model_num: + last_model_num = model_num + last_model = file_name - @abstractmethod - def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - pass + # if there is not checkpoint found above + # find the checkpoint with the latest + # modification date. + key_file_names = [fn for fn in file_names if key in fn] + if last_model is None and len(key_file_names) > 0: + last_model = max(key_file_names, key=os.path.getctime) + last_model_num = torch.load(last_model)["step"] - @abstractmethod - def eval_epoch(self) -> None: - pass + if last_model is not None: + last_models[key] = last_model + last_model_nums[key] = last_model_num - @abstractmethod - def test_run(self) -> None: - pass + # check what models were found + if not last_models: + raise ValueError(f"No models found in continue path {path}!") + if "checkpoint" not in last_models: # no checkpoint just best model + last_models["checkpoint"] = last_models["best_model"] + elif "best_model" not in last_models: # no best model + # this shouldn't happen, but let's handle it just in case + last_models["best_model"] = None + # finally check if last best model is more recent than checkpoint + elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: + last_models["checkpoint"] = last_models["best_model"] - @abstractmethod - def fit(self) -> None: - pass + return last_models["checkpoint"], last_models["best_model"] - @abstractmethod - def save_best_model(self) -> None: - pass - @abstractmethod - def on_epoch_start(self) -> None: - pass +def process_args(args, config=None): + """Process parsed comand line arguments. - @abstractmethod - def on_epoch_end(self) -> None: - pass + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. - @abstractmethod - def on_train_step_start(self) -> None: - pass + Returns: + c (TTS.utils.io.AttrDict): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does + the TensorBoard loggind. 
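For reference, a hypothetical call to `get_last_checkpoint` on a previous run folder; the import location, path and file names below are assumptions, not taken from the patch:

# Assumes get_last_checkpoint is importable from the trainer module patched above and that
# the run folder holds files such as checkpoint_120000.pth.tar and best_model_95000.pth.tar.
from TTS.trainer import get_last_checkpoint

continue_path = "/tmp/tts_runs/ljspeech-ddc-June-18-2021"
checkpoint_path, best_model_path = get_last_checkpoint(continue_path)
print(checkpoint_path, best_model_path)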
+ """ + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # setup output paths and read configs + if config is None: + config = load_config(args.config_path) + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + if config.mixed_precision: + print(" > Mixed precision mode is ON") + experiment_path = args.continue_path + if not experiment_path: + experiment_path = create_experiment_folder(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + # setup rank 0 process in distributed training + tb_logger = None + if args.rank == 0: + os.makedirs(audio_path, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. + if config.has("characters_config"): + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + os.chmod(audio_path, 0o775) + os.chmod(experiment_path, 0o775) + tb_logger = TensorboardLogger(experiment_path, model_name=config.model) + # write model desc to tensorboard + tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, tb_logger - @abstractmethod - def on_train_step_end(self) -> None: - pass + +def init_training(argv: Union[List, Coqpit], config: Coqpit = None): + """Initialization of a training run.""" + if isinstance(argv, Coqpit): + parser = argv.init_argparse(arg_prefix="") + else: + parser = init_arguments() + args = parser.parse_known_args() + config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, config) + return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py deleted file mode 100644 index 705ea5bc..00000000 --- a/TTS/tts/models/tacotron_abstract.py +++ /dev/null @@ -1,245 +0,0 @@ -import copy -from abc import ABC, abstractmethod -from typing import Dict - -import torch -from torch import nn - -from TTS.tts.utils.data import sequence_mask -from TTS.utils.generic_utils import format_aux_input -from TTS.utils.training import gradual_training_scheduler - - -class TacotronAbstract(ABC, nn.Module): - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - d_vector_dim=None, - use_gst=False, - gst=None, - gradual_training=None, - ): - """Abstract Tacotron class""" - super().__init__() - self.num_chars = num_chars - self.r = r - self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim - self.use_gst = use_gst - self.gst = gst - self.num_speakers = num_speakers - self.bidirectional_decoder = bidirectional_decoder - self.double_decoder_consistency = double_decoder_consistency - self.ddc_r = ddc_r - self.attn_type = attn_type - self.attn_win = attn_win - self.attn_norm = attn_norm - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.prenet_dropout_at_inference = prenet_dropout_at_inference - self.forward_attn = forward_attn - self.trans_agent = trans_agent - self.forward_attn_mask = forward_attn_mask - self.location_attn = location_attn - self.attn_K = attn_K - self.separate_stopnet = separate_stopnet - self.encoder_in_features = encoder_in_features - self.decoder_in_features = decoder_in_features - self.d_vector_dim = d_vector_dim - self.gradual_training = gradual_training - - # layers - self.embedding = None - self.encoder = None - self.decoder = None - self.postnet = None - - # multispeaker - if self.d_vector_dim is None: - # if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim - self.use_d_vectors = False - else: - # if d_vector_dim is not None we need use speaker embedding per sample - self.use_d_vectors = True - - # global style token - if self.gst and use_gst: - self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim - self.gst_layer = None - - # model states - self.embedded_speakers = None - self.embedded_speakers_projected = None - - # additional layers - self.decoder_backward = None - self.coarse_decoder = None - - @staticmethod - def _format_aux_input(aux_input: Dict) -> Dict: - return format_aux_input({"d_vectors": None, 
"speaker_ids": None}, aux_input) - - ############################# - # INIT FUNCTIONS - ############################# - - def _init_states(self): - self.embedded_speakers = None - self.embedded_speakers_projected = None - - def _init_backward_decoder(self): - self.decoder_backward = copy.deepcopy(self.decoder) - - def _init_coarse_decoder(self): - self.coarse_decoder = copy.deepcopy(self.decoder) - self.coarse_decoder.r_init = self.ddc_r - self.coarse_decoder.set_r(self.ddc_r) - - ############################# - # CORE FUNCTIONS - ############################# - - @abstractmethod - def forward(self): - pass - - @abstractmethod - def inference(self): - pass - - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - self.decoder.set_r(state["r"]) - if eval: - self.eval() - assert not self.training - - ############################# - # COMMON COMPUTE FUNCTIONS - ############################# - - def compute_masks(self, text_lengths, mel_lengths): - """Compute masks against sequence paddings.""" - # B x T_in_max (boolean) - input_mask = sequence_mask(text_lengths) - output_mask = None - if mel_lengths is not None: - max_len = mel_lengths.max() - r = self.decoder.r - max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len) - return input_mask, output_mask - - def _backward_pass(self, mel_specs, encoder_outputs, mask): - """Run backwards decoder""" - decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask - ) - decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() - return decoder_outputs_b, alignments_b - - def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask): - """Double Decoder Consistency""" - T = mel_specs.shape[1] - if T % self.coarse_decoder.r > 0: - padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) - mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0)) - decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( - encoder_outputs.detach(), mel_specs, input_mask - ) - # scale_factor = self.decoder.r_init / self.decoder.r - alignments_backward = torch.nn.functional.interpolate( - alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" - ).transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward[:, :T, :] - return decoder_outputs_backward, alignments_backward - - ############################# - # EMBEDDING FUNCTIONS - ############################# - - def compute_speaker_embedding(self, speaker_ids): - """Compute speaker embedding vectors""" - if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) - if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) - - def compute_gst(self, inputs, style_input, speaker_embedding=None): - """Compute global style token""" - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) - if speaker_embedding is not None: - query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) - - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) - else: - gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable - inputs = self._concat_speaker_embedding(inputs, gst_outputs) - return inputs - - @staticmethod - def _add_speaker_embedding(outputs, embedded_speakers): - embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + embedded_speakers_ - return outputs - - @staticmethod - def _concat_speaker_embedding(outputs, embedded_speakers): - embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, embedded_speakers_], dim=-1) - return outputs - - ############################# - # CALLBACKS - ############################# - - def on_epoch_start(self, trainer): - """Callback for setting values wrt gradual training schedule. - - Args: - trainer (TrainerTTS): TTS trainer object that is used to train this model. 
- """ - if self.gradual_training: - r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) - trainer.config.r = r - self.decoder.set_r(r) - if trainer.config.bidirectional_decoder: - trainer.model.decoder_backward.set_r(r) - trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) - trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) - print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/trainer_tts.py b/TTS/tts/trainer_tts.py deleted file mode 100644 index 6c900120..00000000 --- a/TTS/tts/trainer_tts.py +++ /dev/null @@ -1,709 +0,0 @@ -# -*- coding: utf-8 -*- - -import importlib -import logging -import os -import time -from argparse import Namespace -from typing import Dict, List, Tuple, Union - -import torch -from coqpit import Coqpit - -# DISTRIBUTED -from torch import nn -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.trainer import TrainerAbstract -from TTS.tts.datasets import TTSDataset, load_meta_data -from TTS.tts.layers import setup_loss -from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, set_init_dict, to_cuda -from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.training import check_update, setup_torch_training_env - - -# pylint: disable=import-outside-toplevel, too-many-public-methods - -class TrainerTTS(TrainerAbstract): - use_cuda, num_gpus = setup_torch_training_env(True, False) - - def __init__( - self, - args: Union[Coqpit, Namespace], - config: Coqpit, - c_logger: ConsoleLogger = None, - tb_logger: TensorboardLogger = None, - model: nn.Module = None, - output_path: str = None, - ) -> None: - self.args = args - self.config = config - self.c_logger = ConsoleLogger() if c_logger is None else c_logger - if tb_logger is None: - self.tb_logger = TensorboardLogger(output_path, model_name=config.model) - self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) - else: - self.tb_logger = tb_logger - self.output_path = output_path - - self.total_steps_done = 0 - self.epochs_done = 0 - self.restore_step = 0 - self.best_loss = float("inf") - self.train_loader = None - self.eval_loader = None - self.output_audio_path = os.path.join(output_path, "test_audios") - - self.keep_avg_train = None - self.keep_avg_eval = None - - log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") - self._setup_logger_config(log_file) - - # model, audio processor, datasets, loss - # init audio processor - self.ap = AudioProcessor(**self.config.audio.to_dict()) - - # init character processor - self.model_characters = self.get_character_processor(self.config) - - # load dataset samples - self.data_train, self.data_eval = load_meta_data(self.config.datasets) - - # default speaker manager - self.speaker_manager = self.get_speaker_manager(self.config, args.restore_path, output_path, self.data_train) - - # init TTS model - if model is not None: - self.model = model - else: - self.model = self.get_model( - len(self.model_characters), - self.speaker_manager.num_speakers, - self.config, - self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, - ) - - # setup criterion - self.criterion = self.get_criterion(self.config) - - # DISTRUBUTED - if self.num_gpus > 1: - init_distributed( - args.rank, - self.num_gpus, - args.group_id, - self.config.distributed_backend, - self.config.distributed_url, - ) - - if self.use_cuda: - self.model.cuda() - self.criterion.cuda() - - # scalers for mixed precision training - self.scaler = torch.cuda.amp.GradScaler() if self.config.mixed_precision and self.use_cuda else None - - # setup optimizer - self.optimizer = self.get_optimizer(self.model, self.config) - - if self.args.restore_path: - self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( - self.config, args.restore_path, self.model, self.optimizer, self.scaler - ) - - # setup scheduler - self.scheduler = self.get_scheduler(self.config, self.optimizer) - - # DISTRUBUTED - if self.num_gpus > 1: - self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) - - # count model size - num_params = count_parameters(self.model) - print("\n > Model has {} parameters".format(num_params)) - - @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, d_vector_dim) - return model - - @staticmethod - def get_optimizer(model: nn.Module, config: Coqpit) -> torch.optim.Optimizer: - optimizer_name = config.optimizer - optimizer_params = config.optimizer_params - if optimizer_name.lower() == "radam": - module = importlib.import_module("TTS.utils.radam") - optimizer = getattr(module, "RAdam") - else: - optimizer = getattr(torch.optim, optimizer_name) - return optimizer(model.parameters(), lr=config.lr, **optimizer_params) - - @staticmethod - def get_character_processor(config: Coqpit) -> str: - # setup custom characters if set in config file. 
- # TODO: implement CharacterProcessor - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - else: - from TTS.tts.utils.text.symbols import phonemes, symbols - model_characters = phonemes if config.use_phonemes else symbols - return model_characters - - @staticmethod - def get_speaker_manager( - config: Coqpit, restore_path: str = "", out_path: str = "", data_train: List = None - ) -> SpeakerManager: - speaker_manager = get_speaker_manager(config, restore_path, data_train, out_path) - return speaker_manager - - @staticmethod - def get_scheduler( - config: Coqpit, optimizer: torch.optim.Optimizer - ) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access - lr_scheduler = config.lr_scheduler - lr_scheduler_params = config.lr_scheduler_params - if lr_scheduler is None: - return None - if lr_scheduler.lower() == "noamlr": - from TTS.utils.training import NoamLR - - scheduler = NoamLR - else: - scheduler = getattr(torch.optim, lr_scheduler) - return scheduler(optimizer, **lr_scheduler_params) - - @staticmethod - def get_criterion(config: Coqpit) -> nn.Module: - return setup_loss(config) - - def restore_model( - self, - config: Coqpit, - restore_path: str, - model: nn.Module, - optimizer: torch.optim.Optimizer, - scaler: torch.cuda.amp.GradScaler = None, - ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: - print(" > Restoring from %s ..." % os.path.basename(restore_path)) - checkpoint = torch.load(restore_path) - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["lr"] = self.config.lr - print( - " > Model restored from step %d" % checkpoint["step"], - ) - restore_step = checkpoint["step"] - return model, optimizer, scaler, restore_step - - def _get_loader( - self, - r: int, - ap: AudioProcessor, - is_eval: bool, - data_items: List, - verbose: bool, - speaker_ids: Union[Dict, List], - d_vectors: Union[Dict, List], - ) -> DataLoader: - if is_eval and not self.config.run_eval: - loader = None - else: - dataset = TTSDataset( - outputs_per_step=r, - text_cleaner=self.config.text_cleaner, - compute_linear_spec=self.config.model.lower() == "tacotron", - meta_data=data_items, - ap=ap, - tp=self.config.characters, - add_blank=self.config["add_blank"], - batch_group_size=0 if is_eval else self.config.batch_group_size * self.config.batch_size, - min_seq_len=self.config.min_seq_len, - max_seq_len=self.config.max_seq_len, - phoneme_cache_path=self.config.phoneme_cache_path, - use_phonemes=self.config.use_phonemes, - phoneme_language=self.config.phoneme_language, - enable_eos_bos=self.config.enable_eos_bos_chars, - use_noise_augment=not is_eval, - verbose=verbose, - speaker_id_mapping=speaker_ids if self.config.use_speaker_embedding else None, - d_vector_mapping=d_vectors - if self.config.use_speaker_embedding and self.config.use_external_speaker_embedding_file - else None, - ) - - if self.config.use_phonemes and self.config.compute_input_seq_cache: - # precompute 
phonemes to have a better estimate of sequence lengths. - dataset.compute_input_seq(self.config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if self.num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=self.config.eval_batch_size if is_eval else self.config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=self.config.num_val_loader_workers if is_eval else self.config.num_loader_workers, - pin_memory=False, - ) - return loader - - def get_train_dataloader( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, False, data_items, verbose, speaker_ids, d_vectors) - - def get_eval_dataloder( - self, r: int, ap: AudioProcessor, data_items: List, verbose: bool, speaker_ids: Dict, d_vectors: Dict - ) -> DataLoader: - return self._get_loader(r, ap, True, data_items, verbose, speaker_ids, d_vectors) - - def format_batch(self, batch: List) -> Dict: - # setup input batch - text_input = batch[0] - text_lengths = batch[1] - speaker_names = batch[2] - linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None - mel_input = batch[4] - mel_lengths = batch[5] - stop_targets = batch[6] - item_idx = batch[7] - d_vectors = batch[8] - speaker_ids = batch[9] - attn_mask = batch[10] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - # compute durations from attention masks - durations = None - if attn_mask is not None: - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - - # set stop targets view, we predict a single stop token per iteration. 
- stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch batch to GPU - if self.use_cuda: - text_input = to_cuda(text_input) - text_lengths = to_cuda(text_lengths) - mel_input = to_cuda(mel_input) - mel_lengths = to_cuda(mel_lengths) - linear_input = to_cuda(linear_input) if self.config.model.lower() in ["tacotron"] else None - stop_targets = to_cuda(stop_targets) - attn_mask = to_cuda(attn_mask) if attn_mask is not None else None - durations = to_cuda(durations) if attn_mask is not None else None - if speaker_ids is not None: - speaker_ids = to_cuda(speaker_ids) - if d_vectors is not None: - d_vectors = to_cuda(d_vectors) - - return { - "text_input": text_input, - "text_lengths": text_lengths, - "speaker_names": speaker_names, - "mel_input": mel_input, - "mel_lengths": mel_lengths, - "linear_input": linear_input, - "stop_targets": stop_targets, - "attn_mask": attn_mask, - "durations": durations, - "speaker_ids": speaker_ids, - "d_vectors": d_vectors, - "max_text_length": max_text_length, - "max_spec_length": max_spec_length, - "item_idx": item_idx, - } - - def _train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.train_step(batch, criterion) - return self.model.train_step(batch, criterion) - - def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: - self.on_train_step_start() - step_start_time = time.time() - - # format data - batch = self.format_batch(batch) - loader_time = time.time() - loader_start_time - - # zero-out optimizer - self.optimizer.zero_grad() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._train_step(batch, self.criterion) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") - - # optimizer step - if self.config.mixed_precision: - # model optimizer step in mixed precision mode - self.scaler.scale(loss_dict["loss"]).backward() - self.scaler.unscale_(self.optimizer) - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.scaler.step(self.optimizer) - self.scaler.update() - else: - # main model optimizer step - loss_dict["loss"].backward() - grad_norm, _ = check_update(self.model, self.config.grad_clip, ignore_stopnet=True) - self.optimizer.step() - - step_time = time.time() - step_start_time - - # setup lr - if self.config.lr_scheduler: - self.scheduler.step() - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - self.keep_avg_train.update_values(update_train_values) - - # print training progress - current_lr = self.optimizer.param_groups[0]["lr"] - if self.total_steps_done % self.config.print_step == 0: - log_dict = { - "max_spec_length": [batch["max_spec_length"], 1], # value, precision - "max_text_length": [batch["max_text_length"], 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": 
current_lr, - } - self.c_logger.print_train_step( - batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values - ) - - if self.args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if self.total_steps_done % self.config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - } - iter_stats.update(loss_dict) - self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) - - if self.total_steps_done % self.config.save_step == 0: - if self.config.checkpoint: - # save model - save_checkpoint( - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - model_loss=loss_dict["loss"], - characters=self.model_characters, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - # training visualizations - if hasattr(self.model, "module"): - figures, audios = self.model.module.train_log(self.ap, batch, outputs) - else: - figures, audios = self.model.train_log(self.ap, batch, outputs) - self.tb_logger.tb_train_figures(self.total_steps_done, figures) - self.tb_logger.tb_train_audios(self.total_steps_done, {"TrainAudio": audios}, self.ap.sample_rate) - self.total_steps_done += 1 - self.on_train_step_end() - return outputs, loss_dict - - def train_epoch(self) -> None: - self.model.train() - epoch_start_time = time.time() - if self.use_cuda: - batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) - else: - batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) - self.c_logger.print_train_start() - for cur_step, batch in enumerate(self.train_loader): - loader_start_time = time.time() - _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) - epoch_time = time.time() - epoch_start_time - # Plot self.epochs_done Stats - if self.args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(self.keep_avg_train.avg_values) - self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) - if self.config.tb_model_param_stats: - self.tb_logger.tb_model_weights(self.model, self.total_steps_done) - - def _eval_step(self, batch: Dict) -> Tuple[Dict, Dict]: - if hasattr(self.model, "module"): - return self.model.module.eval_step(batch, self.criterion) - return self.model.eval_step(batch, self.criterion) - - def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: - with torch.no_grad(): - step_start_time = time.time() - - with torch.cuda.amp.autocast(enabled=self.config.mixed_precision): - outputs, loss_dict = self._eval_step(batch) - - step_time = time.time() - step_start_time - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_step_time"] = step_time - self.keep_avg_eval.update_values(update_eval_values) - - if self.config.print_eval: - self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) - return outputs, loss_dict - - def eval_epoch(self) -> None: - self.model.eval() - self.c_logger.print_eval_start() - loader_start_time = time.time() - batch = None - for cur_step, batch in enumerate(self.eval_loader): - # format data - batch = 
self.format_batch(batch) - loader_time = time.time() - loader_start_time - self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) - outputs, _ = self.eval_step(batch, cur_step) - # Plot epoch stats and samples from the last batch. - if self.args.rank == 0: - if hasattr(self.model, "module"): - figures, eval_audios = self.model.module.eval_log(self.ap, batch, outputs) - else: - figures, eval_audios = self.model.eval_log(self.ap, batch, outputs) - self.tb_logger.tb_eval_figures(self.total_steps_done, figures) - self.tb_logger.tb_eval_audios(self.total_steps_done, {"EvalAudio": eval_audios}, self.ap.sample_rate) - - def test_run( - self, - ) -> None: - print(" | > Synthesizing test sentences.") - test_audios = {} - test_figures = {} - test_sentences = self.config.test_sentences - aux_inputs = self._get_aux_inputs() - for idx, sen in enumerate(test_sentences): - wav, alignment, model_outputs, _ = synthesis( - self.model, - sen, - self.config, - self.use_cuda, - self.ap, - speaker_id=aux_inputs["speaker_id"], - d_vector=aux_inputs["d_vector"], - style_wav=aux_inputs["style_wav"], - enable_eos_bos_chars=self.config.enable_eos_bos_chars, - use_griffin_lim=True, - do_trim_silence=False, - ).values() - - file_path = os.path.join(self.output_audio_path, str(self.total_steps_done)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - self.ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - - self.tb_logger.tb_test_audios(self.total_steps_done, test_audios, self.config.audio["sample_rate"]) - self.tb_logger.tb_test_figures(self.total_steps_done, test_figures) - - def _get_aux_inputs(self) -> Dict: - # setup speaker_id - speaker_id = 0 if self.config.use_speaker_embedding else None - # setup d_vector - d_vector = ( - self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) - if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding - else None - ) - # setup style_mel - if self.config.has("gst_style_input"): - style_wav = self.config.gst_style_input - else: - style_wav = None - if style_wav is None and "use_gst" in self.config and self.config.use_gst: - # inicialize GST with zero dict. 
- style_wav = {} - print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") - for i in range(self.config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} - return aux_inputs - - def fit(self) -> None: - if self.restore_step != 0 or self.args.best_path: - print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") - self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {self.best_loss}.") - - # define data loaders - self.train_loader = self.get_train_dataloader( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - self.eval_loader = ( - self.get_eval_dataloder( - self.config.r, - self.ap, - self.data_train, - verbose=True, - speaker_ids=self.speaker_manager.speaker_ids, - d_vectors=self.speaker_manager.d_vectors, - ) - if self.config.run_eval - else None - ) - - self.total_steps_done = self.restore_step - - for epoch in range(0, self.config.epochs): - self.on_epoch_start() - self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage() if self.config.run_eval else None - self.epochs_done = epoch - self.c_logger.print_epoch_start(epoch, self.config.epochs) - self.train_epoch() - if self.config.run_eval: - self.eval_epoch() - if epoch >= self.config.test_delay_epochs and self.args.rank < 0: - self.test_run() - self.c_logger.print_epoch_end( - epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values - ) - self.save_best_model() - self.on_epoch_end() - - def save_best_model(self) -> None: - self.best_loss = save_best_model( - self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], - self.best_loss, - self.model, - self.optimizer, - self.total_steps_done, - self.epochs_done, - self.config.r, - self.output_path, - self.model_characters, - keep_all_best=self.config.keep_all_best, - keep_after=self.config.keep_after, - scaler=self.scaler.state_dict() if self.config.mixed_precision else None, - ) - - @staticmethod - def _setup_logger_config(log_file: str) -> None: - logging.basicConfig( - level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()] - ) - - def on_epoch_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_start"): - self.model.on_epoch_start(self) - - if hasattr(self.criterion, "on_epoch_start"): - self.criterion.on_epoch_start(self) - - if hasattr(self.optimizer, "on_epoch_start"): - self.optimizer.on_epoch_start(self) - - def on_epoch_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_epoch_end"): - self.model.on_epoch_end(self) - - if hasattr(self.criterion, "on_epoch_end"): - self.criterion.on_epoch_end(self) - - if hasattr(self.optimizer, "on_epoch_end"): - self.optimizer.on_epoch_end(self) - - def on_train_step_start(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_start"): - self.model.on_train_step_start(self) - - if hasattr(self.criterion, "on_train_step_start"): - self.criterion.on_train_step_start(self) - - if hasattr(self.optimizer, "on_train_step_start"): - self.optimizer.on_train_step_start(self) - - def on_train_step_end(self) -> None: # pylint: disable=no-self-use - if hasattr(self.model, "on_train_step_end"): - 
self.model.on_train_step_end(self) - - if hasattr(self.criterion, "on_train_step_end"): - self.criterion.on_train_step_end(self) - - if hasattr(self.optimizer, "on_train_step_end"): - self.optimizer.on_train_step_end(self) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py deleted file mode 100644 index 9d92ae82..00000000 --- a/TTS/utils/arguments.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Argument parser for training scripts.""" - -import argparse -import glob -import os -import re - -import torch - -from TTS.config import load_config -from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.generic_utils import create_experiment_folder, get_git_branch -from TTS.utils.io import copy_model_files -from TTS.utils.logging import ConsoleLogger, TensorboardLogger - - -def init_arguments(argv): - """Parse command line arguments of training scripts. - - Args: - argv (list): This is a list of input arguments as given by sys.argv - - Returns: - argparse.Namespace: Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--continue_path", - type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), - default="", - required="--config_path" not in argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--best_path", - type=str, - help=( - "Best model file to be used for extracting best loss." - "If not specified, the latest best model in continue path is used" - ), - default="", - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") - - return parser - - -def get_last_checkpoint(path): - """Get latest checkpoint or/and best model in path. - - It is based on globbing for `*.pth.tar` and the RegEx - `(checkpoint|best_model)_([0-9]+)`. - - Args: - path (list): Path to files to be compared. - - Raises: - ValueError: If no checkpoint or best_model files are found. - - Returns: - last_checkpoint (str): Last checkpoint filename. - """ - file_names = glob.glob(os.path.join(path, "*.pth.tar")) - last_models = {} - last_model_nums = {} - for key in ["checkpoint", "best_model"]: - last_model_num = None - last_model = None - # pass all the checkpoint files and find - # the one with the largest model number suffix. - for file_name in file_names: - match = re.search(f"{key}_([0-9]+)", file_name) - if match is not None: - model_num = int(match.groups()[0]) - if last_model_num is None or model_num > last_model_num: - last_model_num = model_num - last_model = file_name - - # if there is not checkpoint found above - # find the checkpoint with the latest - # modification date. 
- key_file_names = [fn for fn in file_names if key in fn] - if last_model is None and len(key_file_names) > 0: - last_model = max(key_file_names, key=os.path.getctime) - last_model_num = torch.load(last_model)["step"] - - if last_model is not None: - last_models[key] = last_model - last_model_nums[key] = last_model_num - - # check what models were found - if not last_models: - raise ValueError(f"No models found in continue path {path}!") - if "checkpoint" not in last_models: # no checkpoint just best model - last_models["checkpoint"] = last_models["best_model"] - elif "best_model" not in last_models: # no best model - # this shouldn't happen, but let's handle it just in case - last_models["best_model"] = None - # finally check if last best model is more recent than checkpoint - elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: - last_models["checkpoint"] = last_models["best_model"] - - return last_models["checkpoint"], last_models["best_model"] - - -def process_args(args): - """Process parsed comand line arguments. - - Args: - args (argparse.Namespace or dict like): Parsed input arguments. - - Returns: - c (TTS.utils.io.AttrDict): Config paramaters. - out_path (str): Path to save models and logging. - audio_path (str): Path to save generated test audios. - c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does - logging to the console. - tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does - the TensorBoard loggind. - """ - if isinstance(args, tuple): - args, coqpit_overrides = args - if args.continue_path: - # continue a previous training from its output folder - experiment_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_checkpoint(args.continue_path) - if not args.best_path: - args.best_path = best_model - # setup output paths and read configs - config = load_config(args.config_path) - # override values from command-line args - config.parse_known_args(coqpit_overrides, relaxed_parser=True) - if config.mixed_precision: - print(" > Mixed precision mode is ON") - experiment_path = args.continue_path - if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) - audio_path = os.path.join(experiment_path, "test_audios") - # setup rank 0 process in distributed training - tb_logger = None - if args.rank == 0: - os.makedirs(audio_path, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - # if model characters are not set in the config file - # save the default set to the config file for future - # compatibility. - if config.has("characters_config"): - used_characters = parse_symbols() - new_fields["characters"] = used_characters - copy_model_files(config, experiment_path, new_fields) - os.chmod(audio_path, 0o775) - os.chmod(experiment_path, 0o775) - tb_logger = TensorboardLogger(experiment_path, model_name=config.model) - # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) - c_logger = ConsoleLogger() - return config, experiment_path, audio_path, c_logger, tb_logger - - -def init_training(argv): - """Initialization of a training run.""" - parser = init_arguments(argv) - args = parser.parse_known_args() - config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args) - return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py new file mode 100644 index 00000000..18b6c34c --- /dev/null +++ b/TTS/utils/callbacks.py @@ -0,0 +1,75 @@ +class TrainerCallback: + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + + def on_init_start(self) -> None: + if hasattr(self.trainer.model, "on_init_start"): + self.trainer.model.on_init_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_start"): + self.trainer.criterion.on_init_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_start"): + self.trainer.optimizer.on_init_start(self.trainer) + + def on_init_end(self) -> None: + if hasattr(self.trainer.model, "on_init_end"): + self.trainer.model.on_init_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_end"): + self.trainer.criterion.on_init_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_end"): + self.trainer.optimizer.on_init_end(self.trainer) + + def on_epoch_start(self) -> None: + if hasattr(self.trainer.model, "on_epoch_start"): + self.trainer.model.on_epoch_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_start"): + self.trainer.criterion.on_epoch_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_start"): + self.trainer.optimizer.on_epoch_start(self.trainer) + + def on_epoch_end(self) -> None: + if hasattr(self.trainer.model, "on_epoch_end"): + self.trainer.model.on_epoch_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_end"): + self.trainer.criterion.on_epoch_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_end"): + self.trainer.optimizer.on_epoch_end(self.trainer) + + def on_train_step_start(self) -> None: + if hasattr(self.trainer.model, "on_train_step_start"): + self.trainer.model.on_train_step_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_start"): + self.trainer.criterion.on_train_step_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_start"): + self.trainer.optimizer.on_train_step_start(self.trainer) + + def on_train_step_end(self) -> None: + + if hasattr(self.trainer.model, "on_train_step_end"): + self.trainer.model.on_train_step_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_end"): + self.trainer.criterion.on_train_step_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_end"): + self.trainer.optimizer.on_train_step_end(self.trainer) + + def on_keyboard_interrupt(self) -> None: + if hasattr(self.trainer.model, "on_keyboard_interrupt"): + self.trainer.model.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.criterion, "on_keyboard_interrupt"): + self.trainer.criterion.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.optimizer, "on_keyboard_interrupt"): + self.trainer.optimizer.on_keyboard_interrupt(self.trainer) diff --git a/TTS/utils/distribute.py b/TTS/utils/distribute.py index 7a1078e8..1c6b0e1c 100644 --- a/TTS/utils/distribute.py +++ b/TTS/utils/distribute.py @@ -1,53 +1,8 @@ # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py -import math - import torch 
import torch.distributed as dist from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch.autograd import Variable -from torch.utils.data.sampler import Sampler - - -class DistributedSampler(Sampler): - """ - Non shuffling Distributed Sampler - """ - - def __init__(self, dataset, num_replicas=None, rank=None): - super().__init__(dataset) - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - indices = torch.arange(len(self.dataset)).tolist() - - # add extra samples to make it evenly divisible - indices += indices[: (self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank : self.total_size : self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch def reduce_tensor(tensor, num_gpus): diff --git a/TTS/utils/trainer_utils.py b/TTS/utils/trainer_utils.py new file mode 100644 index 00000000..02e68905 --- /dev/null +++ b/TTS/utils/trainer_utils.py @@ -0,0 +1,65 @@ +import importlib +from typing import Dict + +import torch + +from TTS.utils.training import NoamLR + + +def is_apex_available(): + return importlib.util.find_spec("apex") is not None + + +def setup_torch_training_env(cudnn_enable, cudnn_benchmark): + torch.backends.cudnn.enabled = cudnn_enable + torch.backends.cudnn.benchmark = cudnn_benchmark + torch.manual_seed(54321) + use_cuda = torch.cuda.is_available() + num_gpus = torch.cuda.device_count() + print(" > Using CUDA: ", use_cuda) + print(" > Number of GPUs: ", num_gpus) + return use_cuda, num_gpus + + +def get_scheduler( + lr_scheduler: str, lr_scheduler_params: Dict, optimizer: torch.optim.Optimizer +) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access + """Find, initialize and return a scheduler. + + Args: + lr_scheduler (str): Scheduler name. + lr_scheduler_params (Dict): Scheduler parameters. + optimizer (torch.optim.Optimizer): Optimizer to pass to the scheduler. + + Returns: + torch.optim.lr_scheduler._LRScheduler: Functional scheduler. + """ + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + scheduler = NoamLR + else: + scheduler = getattr(torch.optim.lr_scheduler, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + +def get_optimizer( + optimizer_name: str, optimizer_params: dict, lr: float, model: torch.nn.Module +) -> torch.optim.Optimizer: + """Find, initialize and return a optimizer. + + Args: + optimizer_name (str): Optimizer name. + optimizer_params (dict): Optimizer parameters. + lr (float): Initial learning rate. + model (torch.nn.Module): Model to pass to the optimizer. + + Returns: + torch.optim.Optimizer: Functional optimizer. 
+ """ + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), lr=lr, **optimizer_params) diff --git a/TTS/utils/training.py b/TTS/utils/training.py index 37b32637..aa5651c5 100644 --- a/TTS/utils/training.py +++ b/TTS/utils/training.py @@ -2,17 +2,6 @@ import numpy as np import torch -def setup_torch_training_env(cudnn_enable, cudnn_benchmark): - torch.backends.cudnn.enabled = cudnn_enable - torch.backends.cudnn.benchmark = cudnn_benchmark - torch.manual_seed(54321) - use_cuda = torch.cuda.is_available() - num_gpus = torch.cuda.device_count() - print(" > Using CUDA: ", use_cuda) - print(" > Number of GPUs: ", num_gpus) - return use_cuda, num_gpus - - def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): r"""Check model gradient against unexpected jumps and failures""" skip_flag = False @@ -41,46 +30,6 @@ def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): return grad_norm, skip_flag -def lr_decay(init_lr, global_step, warmup_steps): - r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py""" - warmup_steps = float(warmup_steps) - step = global_step + 1.0 - lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) - return lr - - -def adam_weight_decay(optimizer): - """ - Custom weight decay operation, not effecting grad values. - """ - for group in optimizer.param_groups: - for param in group["params"]: - current_lr = group["lr"] - weight_decay = group["weight_decay"] - factor = -weight_decay * group["lr"] - param.data = param.data.add(param.data, alpha=factor) - return optimizer, current_lr - - -# pylint: disable=dangerous-default-value -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): - """ - Skip biases, BatchNorm parameters, rnns. - and attention projection layer v - """ - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - - if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): - no_decay.append(param) - else: - decay.append(param) - return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] - - # pylint: disable=protected-access class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): @@ -107,3 +56,31 @@ def gradual_training_scheduler(global_step, config): if global_step * num_gpus >= values[0]: new_values = values return new_values[1], new_values[2] + + +def lr_decay(init_lr, global_step, warmup_steps): + r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py + It is only being used by the Speaker Encoder trainer.""" + warmup_steps = float(warmup_steps) + step = global_step + 1.0 + lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) + return lr + + +# pylint: disable=dangerous-default-value +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): + """ + Skip biases, BatchNorm parameters, rnns. 
+ and attention projection layer v + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): + no_decay.append(param) + else: + decay.append(param) + return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] From 98298ee671bfb09f1f06c73f785792f00f189a85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:23:08 +0200 Subject: [PATCH 193/258] Implement unified IO utils --- TTS/tts/utils/io.py | 120 ------------------------------------- TTS/utils/io.py | 121 +++++++++++++++++++++++++++++++++++++ TTS/vocoder/utils/io.py | 128 ---------------------------------------- 3 files changed, 121 insertions(+), 248 deletions(-) delete mode 100644 TTS/tts/utils/io.py delete mode 100644 TTS/vocoder/utils/io.py diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py deleted file mode 100644 index bb8432fa..00000000 --- a/TTS/tts/utils/io.py +++ /dev/null @@ -1,120 +0,0 @@ -import datetime -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - """Load ```TTS.tts.models``` checkpoints. - - Args: - model (TTS.tts.models): model object to load the weights for. - checkpoint_path (string): checkpoint file path. - amp (apex.amp, optional): Apex amp abject to load apex related state vars. Defaults to None. - use_cuda (bool, optional): load model to GPU if True. Defaults to False. - - Returns: - [type]: [description] - """ - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if amp and "amp" in state: - amp.load_state_dict(state["amp"]) - if use_cuda: - model.cuda() - # set model stepsize - if hasattr(model.decoder, "r"): - model.decoder.set_r(state["r"]) - print(" > Model r: ", state["r"]) - if eval: - model.eval() - return model, state - - -def save_model(model, optimizer, current_step, epoch, r, output_path, characters, amp_state_dict=None, **kwargs): - """Save ```TTS.tts.models``` states with extra fields. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None. 
- """ - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - state = { - "model": model_state, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - "r": r, - "characters": characters, - } - if amp_state_dict: - state["amp"] = amp_state_dict - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs): - """Save model checkpoint, intended for saving checkpoints at training. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - """ - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, characters, **kwargs) - - -def save_best_model( - target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs -): - """Save model checkpoint, intended for saving the best model after each epoch. - It compares the current model loss with the best loss so far and saves the - model if the current loss is better. - - Args: - target_loss (float): current model loss. - best_loss (float): best loss so far. - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - - Returns: - float: updated current best loss. 
- """ - if target_loss < best_loss: - file_name = "best_model.pth.tar" - checkpoint_path = os.path.join(output_folder, file_name) - print(" >> BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, optimizer, current_step, epoch, r, checkpoint_path, characters, model_loss=target_loss, **kwargs - ) - best_loss = target_loss - return best_loss diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 62d972f1..871cff6c 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,7 +1,12 @@ +import datetime +import glob import os import pickle as pickle_tts from shutil import copyfile +import torch +from coqpit import Coqpit + class RenamingUnpickler(pickle_tts.Unpickler): """Overload default pickler to solve module renaming problem""" @@ -41,3 +46,119 @@ def copy_model_files(config, out_path, new_fields): config.audio.stats_path, copy_stats_path, ) + + +def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin + try: + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state + + +def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): + if hasattr(model, "module"): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + if isinstance(optimizer, list): + optimizer_state = [optim.state_dict() for optim in optimizer] + else: + optimizer_state = optimizer.state_dict() if optimizer is not None else None + + if isinstance(scaler, list): + scaler_state = [s.state_dict() for s in scaler] + else: + scaler_state = scaler.state_dict() if scaler is not None else None + + if isinstance(config, Coqpit): + config = config.to_dict() + + state = { + "config": config, + "model": model_state, + "optimizer": optimizer_state, + "scaler": scaler_state, + "step": current_step, + "epoch": epoch, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + state.update(kwargs) + torch.save(state, output_path) + + +def save_checkpoint( + config, + model, + optimizer, + scaler, + current_step, + epoch, + output_folder, + **kwargs, +): + file_name = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(output_folder, file_name) + print("\n > CHECKPOINT : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + **kwargs, + ) + + +def save_best_model( + current_loss, + best_loss, + config, + model, + optimizer, + scaler, + current_step, + epoch, + out_path, + keep_all_best=False, + keep_after=10000, + **kwargs, +): + if current_loss < best_loss: + best_model_name = f"best_model_{current_step}.pth.tar" + checkpoint_path = os.path.join(out_path, best_model_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + model_loss=current_loss, + **kwargs, + ) + # only delete previous if current is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) == best_model_name: + continue + os.remove(model_name) + # create symlink to best model for convinience + 
link_name = "best_model.pth.tar" + link_path = os.path.join(out_path, link_name) + if os.path.islink(link_path) or os.path.isfile(link_path): + os.remove(link_path) + os.symlink(best_model_name, os.path.join(out_path, link_name)) + best_loss = current_loss + return best_loss diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py deleted file mode 100644 index 9c67535f..00000000 --- a/TTS/vocoder/utils/io.py +++ /dev/null @@ -1,128 +0,0 @@ -import datetime -import glob -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if use_cuda: - model.cuda() - if eval: - model.eval() - return model, state - - -def save_model( - model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, epoch, output_path, **kwargs -): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - model_disc_state = model_disc.state_dict() if model_disc is not None else None - optimizer_state = optimizer.state_dict() if optimizer is not None else None - optimizer_disc_state = optimizer_disc.state_dict() if optimizer_disc is not None else None - scheduler_state = scheduler.state_dict() if scheduler is not None else None - scheduler_disc_state = scheduler_disc.state_dict() if scheduler_disc is not None else None - state = { - "model": model_state, - "optimizer": optimizer_state, - "scheduler": scheduler_state, - "model_disc": model_disc_state, - "optimizer_disc": optimizer_disc_state, - "scheduler_disc": scheduler_disc_state, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) - for model_name in model_names: - if os.path.basename(model_name) == best_model_name: - continue - os.remove(model_name) 
- # create symlink to best model for convinience - link_name = "best_model.pth.tar" - link_path = os.path.join(out_path, link_name) - if os.path.islink(link_path) or os.path.isfile(link_path): - os.remove(link_path) - os.symlink(best_model_name, os.path.join(out_path, link_name)) - best_loss = current_loss - return best_loss From fcfd95669a27a7b35fe3fe5e83ecbf2a0f0cb61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:24:48 +0200 Subject: [PATCH 194/258] Update model test configs --- tests/inputs/test_align_tts.json | 6 +++--- tests/inputs/test_glow_tts.json | 6 +++--- tests/inputs/test_speedy_speech.json | 6 +++--- tests/inputs/test_tacotron2_config.json | 6 +++--- tests/inputs/test_tacotron_bd_config.json | 6 +++--- tests/inputs/test_tacotron_config.json | 6 +++--- tests/inputs/test_vocoder_multiband_melgan_config.json | 2 +- tests/inputs/test_vocoder_wavegrad.json | 4 ++-- tests/inputs/test_vocoder_wavernn_config.json | 4 ++-- 9 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts.json index 964cc66d..a0d677ad 100644 --- a/tests/inputs/test_align_tts.json +++ b/tests/inputs/test_align_tts.json @@ -123,7 +123,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -140,8 +140,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index 64cc3822..6dd86057 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -115,7 +115,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 
4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 500, // DATASET-RELATED: maximum text length @@ -132,8 +132,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. // DATASETS diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index a29fc992..02783d21 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -120,7 +120,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -137,8 +137,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index cc2c1bb5..6c82891d 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. 
"batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json index 9d2935aa..fbf3c001 100644 --- a/tests/inputs/test_tacotron_bd_config.json +++ b/tests/inputs/test_tacotron_bd_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index c8fae623..b60ed35e 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. 
"use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 794a3fcc..b8b192e4 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -157,7 +157,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 10, // PATHS diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index f6208e8d..6378c07a 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -88,7 +88,7 @@ // OPTIMIZER "epochs": 1, // total number of epochs to train. - "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "grad_clip": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, @@ -107,7 +107,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 4, // PATHS diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index decafa70..ee4e5f8e 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -55,7 +55,7 @@ "padding": 2, // pad the input for resnet to see wider input length // GENERATOR - for backward compatibility - "generator_model": "WaveRNN", + "generator_model": "Wavernn", // DATASET //"use_gta": true, // use computed gta features from the tts model @@ -103,7 +103,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. 
"eval_split_size": 10, // number of samples for testing // PATHS From 626c9d41e63acd04401fb0a4003599c4f4d9fc26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:27:19 +0200 Subject: [PATCH 195/258] Update tests for the new trainer API --- tests/inference_tests/test_synthesizer.py | 12 +- tests/test_extract_tts_spectrograms.py | 10 +- tests/tts_tests/test_align_tts_train.py | 2 +- tests/tts_tests/test_glow_tts.py | 89 +------------- tests/tts_tests/test_glow_tts_train.py | 2 +- tests/tts_tests/test_speedy_speech_layers.py | 27 +++- tests/tts_tests/test_speedy_speech_train.py | 4 +- .../test_tacotron2_d-vectors_train.py | 6 +- tests/tts_tests/test_tacotron2_model.py | 61 ++++++---- .../test_tacotron2_speaker_emb_train.py | 2 +- tests/tts_tests/test_tacotron2_tf_model.py | 2 +- tests/tts_tests/test_tacotron2_train.py | 2 +- tests/tts_tests/test_tacotron_model.py | 115 ++++++------------ tests/tts_tests/test_tacotron_train.py | 2 +- .../test_fullband_melgan_train.py | 8 +- tests/vocoder_tests/test_hifigan_train.py | 8 +- tests/vocoder_tests/test_melgan_train.py | 8 +- .../test_multiband_melgan_train.py | 8 +- .../test_parallel_wavegan_train.py | 8 +- tests/vocoder_tests/test_vocoder_wavernn.py | 28 ++++- tests/vocoder_tests/test_wavegrad.py | 14 +-- tests/vocoder_tests/test_wavegrad_layers.py | 7 +- tests/vocoder_tests/test_wavegrad_train.py | 10 +- tests/vocoder_tests/test_wavernn_train.py | 11 +- 24 files changed, 174 insertions(+), 272 deletions(-) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index 4379c8ca..5972dc90 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -3,8 +3,7 @@ import unittest from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.tts.utils.io import save_checkpoint -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer from .. 
import get_tests_output_path @@ -14,15 +13,10 @@ class SynthesizerTest(unittest.TestCase): # pylint: disable=R0201 def _create_random_model(self): # pylint: disable=global-statement - global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json")) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model = setup_model(num_chars, 0, config) + model = setup_model(config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, 10, 10, 1, output_path, None) + save_checkpoint(config, model, None, None, 10, 1, output_path) def test_in_out(self): self._create_random_model() diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index d16167ed..8c795d58 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -6,7 +6,6 @@ import torch from tests import get_tests_input_path, get_tests_output_path, run_cli from TTS.config import load_config from TTS.tts.models import setup_model -from TTS.tts.utils.text.symbols import phonemes, symbols torch.manual_seed(1) @@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, d_vector_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 61d67c5c..3700b1d3 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -13,7 +13,7 @@ config = AlignTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 8a2a8fb3..171f2cdc 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -41,64 +41,11 @@ class GlowTTSTrainTest(unittest.TestCase): criterion = GlowTTSLoss() # model to train - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - 
dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) # reference model to compare model weights - model_ref = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + model_ref = GlowTTS(config).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) @@ -149,34 +96,8 @@ class GlowTTSInferenceTest(unittest.TestCase): speaker_ids = torch.randint(0, 5, (8,)).long().to(device) # create model - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) model.eval() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 30aaefc4..24c5c4cf 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -13,7 +13,7 @@ config = GlowTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, use_espeak_phonemes=True, diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index d2f62d49..a5c481f1 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -1,7 +1,8 @@ import torch +from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor -from TTS.tts.models.speedy_speech import SpeedySpeech +from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs from TTS.tts.utils.data import sequence_mask use_cuda = torch.cuda.is_available() @@ -40,7 +41,8 @@ def test_speedy_speech(): y_lengths = durations.sum(1) - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128) + config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128)) + model = SpeedySpeech(config) if use_cuda: model.cuda() @@ -55,7 +57,12 @@ def test_speedy_speech(): assert list(o_dr.shape) == [B, T_en] # with speaker embedding - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) + config = 
SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256 + ) + ) + model = SpeedySpeech(config).to(device) model.forward( x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) @@ -68,9 +75,17 @@ def test_speedy_speech(): assert list(o_dr.shape) == [B, T_en] # with speaker external embedding - model = SpeedySpeech( - num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 - ).to(device) + config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, + out_channels=80, + hidden_channels=128, + num_speakers=10, + use_d_vector=True, + d_vector_dim=256, + ) + ) + model = SpeedySpeech(config).to(device) model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index d677f46f..28dc7029 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -4,16 +4,18 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs import SpeedySpeechConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0), batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="en-us", diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 7fda7e09..3313b8c4 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", @@ -24,11 +24,11 @@ config = Tacotron2Config( print_step=1, print_eval=True, use_speaker_embedding=True, - use_external_speaker_embedding_file=True, + use_d_vector_file=True, test_sentences=[ "Be a voice, not an echo.", ], - external_speaker_embedding_file="tests/data/ljspeech/speakers.json", + d_vector_file="tests/data/ljspeech/speakers.json", max_decoder_steps=50, ) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index fc3d9799..a8132467 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -7,6 +7,7 @@ from torch import nn, optim from tests import get_tests_input_path from TTS.tts.configs import Tacotron2Config +from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.layers.losses import MSELossMasked from TTS.tts.models.tacotron2 import Tacotron2 from TTS.utils.audio import AudioProcessor @@ -17,19 +18,20 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = Tacotron2Config() +config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, 
decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") class TacotronTrainTest(unittest.TestCase): def test_train_step(self): # pylint: disable=no-self-use + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -38,19 +40,19 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -77,11 +79,12 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -90,19 +93,20 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 
0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids} @@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase): # pylint: disable=no-self-use def test_train_step(self): # with random gst mel style + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -143,19 +148,21 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -199,19 +206,19 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = 
optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -255,18 +263,19 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(device) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index a242c724..41d694f6 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/tts_tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py index ee7f720b..431b0c2f 100644 --- a/tests/tts_tests/test_tacotron2_tf_model.py +++ b/tests/tts_tests/test_tacotron2_tf_model.py @@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase): num_chars=24, num_speakers=0, r=3, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 577de014..e947a54a 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -13,7 +13,7 @@ config = Tacotron2Config( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, 
phoneme_language="en-us", diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 2abd968d..6c673568 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -6,7 +6,7 @@ import torch from torch import nn, optim from tests import get_tests_input_path -from TTS.tts.configs import TacotronConfig +from TTS.tts.configs import GSTConfig, TacotronConfig from TTS.tts.layers.losses import L1LossMasked from TTS.tts.models.tacotron import Tacotron from TTS.utils.audio import AudioProcessor @@ -17,9 +17,9 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = TacotronConfig() +config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") @@ -31,11 +31,12 @@ def count_parameters(model): class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -66,7 +58,7 @@ class TacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = 
torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - d_vector_dim=55, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.d_vector_dim = 55 + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -127,7 +111,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} @@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 120, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 120, (8,)).long().to(device) mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) @@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor 
model.train() # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) @@ -191,7 +167,7 @@ class TacotronGSTTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device) + linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) @@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) @@ -254,7 +219,7 @@ class TacotronGSTTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(10): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} @@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], 
stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - use_gst=True, - gst=c.gst, - r=c.r, - memory_size=c.memory_size, - d_vector_dim=55, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.d_vector_dim = 55 + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -316,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): outputs = model.forward( input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index 010154e2..0c35ee28 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -13,7 +13,7 @@ config = TacotronConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py index f93a5318..9d4e1933 100644 --- a/tests/vocoder_tests/test_fullband_melgan_train.py +++ b/tests/vocoder_tests/test_fullband_melgan_train.py @@ -12,7 +12,7 @@ config = FullbandMelganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py index 11057570..c506fb48 100644 --- a/tests/vocoder_tests/test_hifigan_train.py +++ b/tests/vocoder_tests/test_hifigan_train.py @@ -13,7 +13,7 @@ config = HifiganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ 
config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py index 551b786a..6ef9cd49 100644 --- a/tests/vocoder_tests/test_melgan_train.py +++ b/tests/vocoder_tests/test_melgan_train.py @@ -12,7 +12,7 @@ config = MelganConfig( batch_size=4, eval_batch_size=4, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,9 +29,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 5c6a0fc8..daf2841b 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -12,7 +12,7 @@ config = MultibandMelganConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -30,9 +30,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -40,7 +38,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py 
b/tests/vocoder_tests/test_parallel_wavegan_train.py index fb6ea87c..a126befe 100644 --- a/tests/vocoder_tests/test_parallel_wavegan_train.py +++ b/tests/vocoder_tests/test_parallel_wavegan_train.py @@ -12,7 +12,7 @@ config = ParallelWaveganConfig( batch_size=4, eval_batch_size=4, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -28,9 +28,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_vocoder_wavernn.py b/tests/vocoder_tests/test_vocoder_wavernn.py index 9c58fa1c..b5c769ee 100644 --- a/tests/vocoder_tests/test_vocoder_wavernn.py +++ b/tests/vocoder_tests/test_vocoder_wavernn.py @@ -3,11 +3,13 @@ import random import numpy as np import torch -from TTS.vocoder.models.wavernn import WaveRNN +from TTS.vocoder.configs import WavernnConfig +from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs def test_wavernn(): - model = WaveRNN( + config = WavernnConfig() + config.model_args = WavernnArgs( rnn_dims=512, fc_dims=512, mode=10, @@ -20,14 +22,30 @@ def test_wavernn(): compute_dims=128, res_out_dims=128, num_res_blocks=10, - hop_length=256, - sample_rate=22050, ) + config.audio.hop_length = 256 + config.audio.sample_rate = 2048 + dummy_x = torch.rand((2, 1280)) dummy_m = torch.rand((2, 80, 9)) y_size = random.randrange(20, 60) dummy_y = torch.rand((80, y_size)) + + # mode: mold + model = Wavernn(config) output = model(dummy_x, dummy_m) - assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + assert np.all(output.shape == (2, 1280, 30)), output.shape + + # mode: gauss + config.model_params.mode = "gauss" + model = Wavernn(config) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 2)), output.shape + + # mode: quantized + config.model_params.mode = 4 + model = Wavernn(config) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape output = model.inference(dummy_y, True, 5500, 550) assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py index a28409e5..43b5f080 100644 --- a/tests/vocoder_tests/test_wavegrad.py +++ b/tests/vocoder_tests/test_wavegrad.py @@ -4,7 +4,8 @@ import numpy as np import torch from torch import optim -from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.vocoder.configs import WavegradConfig +from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs # pylint: disable=unused-variable @@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 80, 20).to(device) criterion = torch.nn.L1Loss().to(device) - model = Wavegrad( + args = WavegradArgs( in_channels=80, 
out_channels=1, upsample_factors=[5, 5, 3, 2, 2], upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) - model_ref = Wavegrad( - in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ) + model_ref = Wavegrad(config) model.train() model.to(device) betas = np.linspace(1e-6, 1e-2, 1000) diff --git a/tests/vocoder_tests/test_wavegrad_layers.py b/tests/vocoder_tests/test_wavegrad_layers.py index 0180eb0a..a0b021dc 100644 --- a/tests/vocoder_tests/test_wavegrad_layers.py +++ b/tests/vocoder_tests/test_wavegrad_layers.py @@ -1,7 +1,8 @@ import torch +from TTS.vocoder.configs import WavegradConfig from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock -from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs def test_positional_encoding(): @@ -75,12 +76,14 @@ def test_wavegrad_forward(): c = torch.rand(32, 80, 20) noise_scale = torch.rand(32) - model = Wavegrad( + args = WavegradArgs( in_channels=80, out_channels=1, upsample_factors=[5, 5, 3, 2, 2], upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], ) + config = WavegradConfig(model_params=args) + model = Wavegrad(config) o = model.forward(x, c, noise_scale) assert o.shape[0] == 32 diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py index e222de3a..fe56ee78 100644 --- a/tests/vocoder_tests/test_wavegrad_train.py +++ b/tests/vocoder_tests/test_wavegrad_train.py @@ -12,7 +12,7 @@ config = WavegradConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, test_delay_epochs=-1, epochs=1, @@ -29,15 +29,15 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} " +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " +) run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py index 414ed719..43fc5fb1 100644 --- a/tests/vocoder_tests/test_wavernn_train.py +++ b/tests/vocoder_tests/test_wavernn_train.py @@ -4,15 +4,18 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.vocoder.configs import WavernnConfig +from TTS.vocoder.models.wavernn import WavernnArgs config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") + config = WavernnConfig( + model_params=WavernnArgs(), batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, run_eval=True, 
test_delay_epochs=-1, epochs=1, @@ -28,9 +31,7 @@ config.audio.trim_db = 60 config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} " run_cli(command_train) # Find latest folder @@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm # restore the model and continue training for one more epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} " ) run_cli(command_train) shutil.rmtree(continue_path) From 786170fe7d5da036bbb44fa269f9e20865a9354f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:28:58 +0200 Subject: [PATCH 196/258] Update tts model configs --- TTS/tts/configs/align_tts_config.py | 21 ++--- TTS/tts/configs/glow_tts_config.py | 84 ++++++++++++++++--- TTS/tts/configs/shared_configs.py | 12 ++- TTS/tts/configs/speedy_speech_config.py | 56 ++----------- TTS/tts/configs/tacotron2_config.py | 105 +----------------------- TTS/tts/configs/tacotron_config.py | 39 +++++++-- 6 files changed, 133 insertions(+), 184 deletions(-) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 56622741..837cd519 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.align_tts import AlignTTSArgs @dataclass @@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. 
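[Editor's note, not part of the patch: a minimal sketch of how the renamed multi-speaker options shown in the hunk above would be used. Only `use_speaker_embedding`, `use_d_vector_file`, and `d_vector_file` come from the diff; the embeddings file path is a made-up placeholder.]

```python
# Illustrative only -- field names follow the rename in the hunk above
# (use_external_speaker_embedding_file -> use_d_vector_file, etc.).
from TTS.tts.configs import AlignTTSConfig

config = AlignTTSConfig()
config.use_speaker_embedding = False    # learned speaker-embedding table stays off
config.use_d_vector_file = True         # was: use_external_speaker_embedding_file
config.d_vector_file = "speakers.json"  # was: external_speaker_embedding_file (hypothetical path)
```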
@@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params - positional_encoding: bool = True - hidden_channels_dp: int = 256 - hidden_channels: int = 256 - encoder_type: str = "fftransformer" - encoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) - decoder_type: str = "fftransformer" - decoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) + model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) phase_start_steps: List[int] = None ssim_alpha: float = 1.0 @@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "Adam" diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 925854c9..19b7abd9 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig): Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}` use_encoder_prenet (bool): enable / disable the use of a prenet for the encoder. Defaults to True. - hidden_channels_encoder (int): + hidden_channels_enc (int): Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): + hidden_channels_dec (int): Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): + hidden_channels_dp (int): Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + mean_only (bool): + If true predict only the mean values by the decoder flow. Defaults to True. + out_channels (int): + Number of channels of the model output tensor. Defaults to 80. + num_flow_blocks_dec (int): + Number of decoder blocks. Defaults to 12. + inference_noise_scale (float): + Noise scale used at inference. Defaults to 0.33. + kernel_size_dec (int): + Decoder kernel size. Defaults to 5 + dilation_rate (int): + Rate to increase dilation by each layer in a decoder block. Defaults to 5. + num_block_layers (int): + Number of decoder layers in each decoder block. Defaults to 4. + dropout_p_dec (float): + Dropout rate for decoder. Defaults to 0.1. + num_speaker (int): + Number of speaker to define the size of speaker embedding layer. Defaults to 0. + c_in_channels (int): + Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0. + num_splits (int): + Number of split levels in inversible conv1x1 operation. Defaults to 4. + num_squeeze (int): + Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor + 'num_squeeze'. Defaults to 1. + sigmoid_scale (bool): + enable/disable sigmoid scaling in decoder. Defaults to False. + mean_only (bool): + If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true. + encoder_type (str): + Encoder module type. 
Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]` + Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper. + encoder_params (dict): + Encoder module parameters. Defaults to None. + d_vector_dim (int): + Channels of external speaker embedding vectors. Defaults to 0. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params + num_chars: int = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig): } ) use_encoder_prenet: bool = True - hidden_channels_encoder: int = 192 - hidden_channels_decoder: int = 192 - hidden_channels_duration_predictor: int = 256 + hidden_channels_enc: int = 192 + hidden_channels_dec: int = 192 + hidden_channels_dp: int = 256 + dropout_p_dp: float = 0.1 + dropout_p_dec: float = 0.05 + mean_only: bool = True + out_channels: int = 80 + num_flow_blocks_dec: int = 12 + inference_noise_scale: float = 0.33 + kernel_size_dec: int = 5 + dilation_rate: int = 5 + num_block_layers: int = 4 + num_speakers: int = 0 + c_in_channels: int = 0 + num_splits: int = 4 + num_squeeze: int = 1 + sigmoid_scale: bool = False + mean_only: bool = False + encoder_type: str = "rel_pos_transformer" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 3, + "dropout_p": 0.1, + "num_layers": 6, + "num_heads": 2, + "hidden_channels_ffn": 768, + "input_length": None, + } + ) + d_vector_dim: int = 0 # training params data_dep_init_steps: int = 10 @@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index d02e58ae..4b916a17 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass, field from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig): use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False - text_cleaner: str = MISSING + text_cleaner: str = None enable_eos_bos_chars: bool = False 
test_sentences_file: str = "" phoneme_cache_path: str = None @@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig): # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer - optimizer: str = MISSING - optimizer_params: dict = MISSING + optimizer: str = None + optimizer_params: dict = None # scheduler lr_scheduler: str = "" lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing test_sentences: List[str] = field(default_factory=lambda: []) + # multi-speaker + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index d76d94e2..b2641ab5 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs @dataclass @@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig): Args: model (str): Model name used for selecting the right model at initialization. Defaults to `speedy_speech`. - positional_encoding (bool): - enable / disable positional encoding applied to the encoder output. Defaults to True. - hidden_channels (int): - Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder - parameters. Defaults to 128. - encoder_type (str): - Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `residual_conv_bn`. - encoder_params (dict): - Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}` - decoder_type (str): - Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `residual_conv_bn`. - decoder_params (dict): - Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}` - hidden_channels_encoder (int): - Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, - and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): - Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): - Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + model_args (Coqpit): + Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. 
- use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig): model: str = "speedy_speech" # model specific params - positional_encoding: bool = True - hidden_channels: int = 128 - encoder_type: str = "residual_conv_bn" - encoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], - "num_conv_blocks": 2, - "num_res_blocks": 13, - } - ) - decoder_type: str = "residual_conv_bn" - decoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], - "num_conv_blocks": 2, - "num_res_blocks": 17, - } - ) + model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs) # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py index ea66fae8..b622e640 100644 --- a/TTS/tts/configs/tacotron2_config.py +++ b/TTS/tts/configs/tacotron2_config.py @@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig): >>> from TTS.tts.configs import Tacotron2Config >>> config = Tacotron2Config() - Args: - model (str): - Model name used to select the right model class to initilize. Defaults to `Tacotron2`. - use_gst (bool): - enable / disable the use of Global Style Token modules. Defaults to False. - gst (GSTConfig): - Instance of `GSTConfig` class. - gst_style_input (str): - Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and - this is not defined, the model uses a zero vector as an input. Defaults to None. - r (int): - Number of output frames that the decoder computed per iteration. Larger values makes training and inference - faster but reduces the quality of the output frames. This needs to be tuned considering your own needs. - Defaults to 1. - gradual_trainin (List[List]): - Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is - the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. - If sets None, no gradual training is used. Defaults to None. - memory_size (int): - Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. - Defaults to -1. - prenet_type (str): - `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the - Prenet. Defaults to `original`. - prenet_dropout (bool): - enables / disables the use of dropout in the Prenet. Defaults to True. - prenet_dropout_at_inference (bool): - enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. - stopnet (bool): - enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. - stopnet_pos_weight (float): - Weight that is applied to over-weight positive instances in the Stopnet loss. 
Use larger values with - datasets with longer sentences. Defaults to 10. - separate_stopnet (bool): - Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. - attention_type (str): - attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attention_heads (int): - Number of attention heads for GMM attention. Defaults to 5. - windowing (bool): - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - use_forward_attn (bool): - It is only valid if ```attn_type``` is ```original```. Defaults to False. - forward_attn_mask (bool): - enable/disable extra masking over forward attention. It is useful at inference to prevent - possible attention failures. Defaults to False. - transition_agent (bool): - enable/disable transition agent in forward attention. Defaults to False. - location_attn (bool): - enable/disable location sensitive attention as in the original Tacotron2 paper. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - bidirectional_decoder (bool): - enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool): - enable/disable double decoder consistency. Defaults to False. - ddc_r (int): - reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this - as a multiple of the `r` value. Defaults to 6. - use_speaker_embedding (bool): - enable / disable using speaker embeddings for multi-speaker models. If set True, the model is - in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): - enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. - lr (float): - Initial learning rate. Defaults to `1e-4`. - wd (float): - Weight decay coefficient. Defaults to `1e-6`. - grad_clip (float): - Gradient clipping threshold. Defaults to `5`. - seq_len_notm (bool): - enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample - is divided by the sequence length. Defaults to False. - loss_masking (bool): - enable / disable masking the paddings of the samples in loss computation. Defaults to True. - decoder_loss_alpha (float): - Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_loss_alpha (float): - Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_diff_spec_alpha (float): - Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_diff_spec_alpha (float): - Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_ssim_alpha (float): - Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. 
Defaults to 0.25 - postnet_ssim_alpha (float): - Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - ga_alpha (float): - Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss - function. Defaults to 5. + Check `TacotronConfig` for argument descriptions. """ model: str = "tacotron2" + out_channels: int = 80 + encoder_in_features: int = 512 + decoder_in_features: int = 512 diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 2b67901c..89fb8d81 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig): gst_style_input (str): Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and this is not defined, the model uses a zero vector as an input. Defaults to None. + num_chars (int): + Number of characters used by the model. It must be defined before initializing the model. Defaults to None. + num_speakers (int): + Number of speakers for multi-speaker models. Defaults to 1. r (int): Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in @@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. max_decoder_steps (int): - Max number of steps allowed for the decoder. Defaults to 10000. + Max number of steps allowed for the decoder. Defaults to 50. + encoder_in_features (int): + Channels of encoder input and character embedding tensors. Defaults to 256. + decoder_in_features (int): + Channels of decoder input and encoder output tensors. Defaults to 256. + out_channels (int): + Channels of the final model output. It must match the spectragram size. Defaults to 80. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. optimizer (str): Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. @@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig): Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. Defaults to 0.25 decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. 
Defaults to 0.25 decoder_ssim_alpha (float): @@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig): """ model: str = "tacotron" + # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs()) use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None # model specific params + num_speakers: int = 1 + num_chars: int = 0 r: int = 2 gradual_training: List[List[int]] = None memory_size: int = -1 @@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 - max_decoder_steps: int = 10000 + max_decoder_steps: int = 500 + encoder_in_features: int = 256 + decoder_in_features: int = 256 + decoder_output_dim: int = 80 + out_channels: int = 513 # attention layers attention_type: str = "original" attention_heads: int = None attention_norm: str = "sigmoid" + attention_win: bool = False windowing: bool = False use_forward_attn: bool = False forward_attn_mask: bool = False @@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + speaker_embedding_dim: int = 512 + use_d_vector_file: bool = False + d_vector_file: str = False + d_vector_dim: int = None # optimizer parameters optimizer: str = "RAdam" @@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig): assert ( self.gradual_training[0][1] == self.r ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}" + if self.model == "tacotron" and self.audio is not None: + assert self.out_channels == ( + self.audio.fft_size // 2 + 1 + ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + if self.model == "tacotron2" and self.audio is not None: + assert self.out_channels == self.audio.num_mels From a358f74a52b56ae8ce6c18adead82a32f3705662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:29:35 +0200 Subject: [PATCH 197/258] Update vocoder model configs --- TTS/vocoder/configs/fullband_melgan_config.py | 4 +-- .../configs/multiband_melgan_config.py | 2 +- .../configs/parallel_wavegan_config.py | 2 +- TTS/vocoder/configs/shared_configs.py | 28 +++++++++--------- TTS/vocoder/configs/wavegrad_config.py | 29 +++---------------- TTS/vocoder/configs/wavernn_config.py | 29 +++++-------------- 6 files changed, 29 insertions(+), 65 deletions(-) diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py index 53444214..2ab83aac 100644 --- a/TTS/vocoder/configs/fullband_melgan_config.py +++ b/TTS/vocoder/configs/fullband_melgan_config.py @@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. Defaults to @@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. 
""" - model: str = "melgan" + model: str = "fullband_melgan" # Model specific params discriminator_model: str = "melgan_multiscale_discriminator" diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 81fd7904..76311353 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. Defaults to diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index d132d2e1..a89b1f3f 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`. + Model name used for selecting the right configuration at initialization. Defaults to `gan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'parallel_wavegan_discriminator`. discriminator_model_params (dict): The discriminator model kwargs. Defaults to diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index 664032d2..6891ce6c 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig): Number of training epochs to. Defaults to 10000. wd (float): Weight decay. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig): # OPTIMIZER epochs: int = 10000 # total number of epochs to train. wd: float = 0.0 # Weight decay weight. + optimizer: str = "AdamW" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) @dataclass @@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig): }` target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_G_loss`. - gen_clip_grad (float): - Gradient clipping threshold for the generator model. Any value less than 0 disables clipping. - Defaults to -1. - disc_clip_grad (float): - Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping. - Defaults to -1. + grad_clip (list): + A list of gradient clipping theresholds for each optimizer. Any value less than 0 disables clipping. + Defaults to [5, 5]. lr_gen (float): Generator model initial learning rate. Defaults to 0.0002. lr_disc (float): Discriminator model initial learning rate. Defaults to 0.0002. - optimizer (torch.optim.Optimizer): - Optimizer used for the training. Defaults to `AdamW`. - optimizer_params (dict): - Optimizer kwargs. 
Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` lr_scheduler_gen (torch.optim.Scheduler): Learning rate scheduler for the generator. Defaults to `ExponentialLR`. lr_scheduler_gen_params (dict): @@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig): Enabling it results in slower iterations but faster convergance in some cases. Defaults to False. """ + model: str = "gan" + # LOSS PARAMETERS use_stft_loss: bool = True use_subband_stft_loss: bool = True @@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig): } ) - target_loss: str = "avg_G_loss" # loss value to pick the best model to save after each epoch + target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch # optimizer - gen_clip_grad: float = -1 # Generator gradient clipping threshold. Apply gradient clipping if > 0 - disc_clip_grad: float = -1 # Discriminator gradient clipping threshold. + grad_clip: float = field(default_factory=lambda: [5, 5]) lr_gen: float = 0.0002 # Initial learning rate. lr_disc: float = 0.0002 # Initial learning rate. - optimizer: str = "AdamW" - optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py index 271422ee..c39813ae 100644 --- a/TTS/vocoder/configs/wavegrad_config.py +++ b/TTS/vocoder/configs/wavegrad_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavegrad import WavegradArgs @dataclass @@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig): Model name used for selecting the right model at initialization. Defaults to `wavegrad`. generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is considered as a generator too. Defaults to `wavegrad`. - model_params (dict): - WaveGrad kwargs. Defaults to - ` - { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ` + model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values. target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`. 
epochs (int): @@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig): model: str = "wavegrad" # Model specific params generator_model: str = "wavegrad" - model_params: dict = field( - default_factory=lambda: { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ) - target_loss: str = "avg_wavegrad_loss" # loss value to pick the best model to save after each epoch + model_params: WavegradArgs = field(default_factory=WavegradArgs) + target_loss: str = "loss" # loss value to pick the best model to save after each epoch # Training - overrides epochs: int = 10000 diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py index 95a3cfc4..0afa1f43 100644 --- a/TTS/vocoder/configs/wavernn_config.py +++ b/TTS/vocoder/configs/wavernn_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavernn import WavernnArgs @dataclass @@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig): Batch size used at training. Larger values use more memory. Defaults to 256. seq_len (int): Audio segment length used at training. Larger values use more memory. Defaults to 1280. - padding (int): - Padding applied to the input feature frames against the convolution layers of the feature network. - Defaults to 2. + use_noise_augment (bool): enable / disable random noise added to the input waveform. The noise is added after computing the features. Defaults to True. @@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig): enable / disable mixed precision training. Default is True. eval_split_size (int): Number of samples used for evalutaion. Defaults to 50. - test_every_epoch (int): + num_epochs_before_test (int): Number of epochs waited to run the next evalution. Since inference takes some time, it is better to wait some number of epochs not ot waste training time. Defaults to 10. grad_clip (float): @@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig): model: str = "wavernn" # Model specific params - mode: str = "mold" # mold [string], gauss [string], bits [int] - mulaw: bool = True # apply mulaw if mode is bits - generator_model: str = "WaveRNN" - wavernn_model_params: dict = field( - default_factory=lambda: { - "rnn_dims": 512, - "fc_dims": 512, - "compute_dims": 128, - "res_out_dims": 128, - "num_res_blocks": 10, - "use_aux_net": True, - "use_upsample_net": True, - "upsample_factors": [4, 8, 8], # this needs to correctly factorise hop_length - } - ) + model_params: WavernnArgs = field(default_factory=WavernnArgs) + target_loss: str = "loss" # Inference batched: bool = True @@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig): epochs: int = 10000 batch_size: int = 256 seq_len: int = 1280 - padding: int = 2 use_noise_augment: bool = False use_cache: bool = True mixed_precision: bool = True eval_split_size: int = 50 - test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). + num_epochs_before_test: int = ( + 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). 
+ ) # optimizer overrides grad_clip: float = 4.0 From 7b8c15ac4915e03936089b352e723256e5baacec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:35:36 +0200 Subject: [PATCH 198/258] =?UTF-8?q?Create=20base=20=F0=9F=90=B8TTS=20model?= =?UTF-8?q?=20abstraction=20for=20tts=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/{tts/models/abstract_tts.py => model.py} | 41 ++- TTS/tts/models/align_tts.py | 159 +++++++---- TTS/tts/models/base_tacotron.py | 286 +++++++++++++++++++ TTS/tts/models/base_tts.py | 233 +++++++++++++++ TTS/tts/models/glow_tts.py | 155 ++++------ TTS/tts/models/speedy_speech.py | 147 +++++++--- TTS/tts/models/tacotron.py | 216 +++++--------- TTS/tts/models/tacotron2.py | 206 +++++-------- TTS/tts/tf/models/tacotron2.py | 6 +- TTS/vocoder/models/base_vocoder.py | 20 ++ 10 files changed, 968 insertions(+), 501 deletions(-) rename TTS/{tts/models/abstract_tts.py => model.py} (86%) create mode 100644 TTS/tts/models/base_tacotron.py create mode 100644 TTS/tts/models/base_tts.py create mode 100644 TTS/vocoder/models/base_vocoder.py diff --git a/TTS/tts/models/abstract_tts.py b/TTS/model.py similarity index 86% rename from TTS/tts/models/abstract_tts.py rename to TTS/model.py index 9132f7eb..aefb925e 100644 --- a/TTS/tts/models/abstract_tts.py +++ b/TTS/model.py @@ -1,9 +1,9 @@ -from coqpit import Coqpit from abc import ABC, abstractmethod -from typing import Dict, Tuple +from typing import Dict, List, Tuple, Union import numpy as np import torch +from coqpit import Coqpit from torch import nn from TTS.utils.audio import AudioProcessor @@ -11,8 +11,8 @@ from TTS.utils.audio import AudioProcessor # pylint: skip-file -class TTSModel(nn.Module, ABC): - """Abstract TTS class. Every new `tts` model must inherit this. +class BaseModel(nn.Module, ABC): + """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this. Notes on input/output tensor shapes: Any input or output tensor of the model must be shaped as @@ -77,7 +77,6 @@ class TTSModel(nn.Module, ABC): ... return outputs_dict, loss_dict - @abstractmethod def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: """Create visualizations and waveform examples for training. @@ -92,10 +91,7 @@ class TTSModel(nn.Module, ABC): Returns: Tuple[Dict, np.ndarray]: training plots and output waveform. """ - figures_dict = {} - output_wav = np.array() - ... - return figures_dict, output_wav + return None, None @abstractmethod def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: @@ -114,13 +110,9 @@ class TTSModel(nn.Module, ABC): ... return outputs_dict, loss_dict - @abstractmethod def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: """The same as `train_log()`""" - figures_dict = {} - output_wav = np.array() - ... - return figures_dict, output_wav + return None, None @abstractmethod def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None: @@ -132,3 +124,24 @@ class TTSModel(nn.Module, ABC): eval (bool, optional): If true, init model for inference else for training. Defaults to False. """ ... + + def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: + """Setup an return optimizer or optimizers.""" + pass + + def get_lr(self) -> Union[float, List[float]]: + """Return learning rate(s). + + Returns: + Union[float, List[float]]: Model's initial learning rates. 
+ """ + pass + + def get_scheduler(self, optimizer: torch.optim.Optimizer): + pass + + def get_criterion(self): + pass + + def format_batch(self): + pass diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 75fb50de..dbd57b83 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,9 @@ +from dataclasses import dataclass, field +from typing import Dict, Tuple + import torch import torch.nn as nn +from coqpit import Coqpit from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder @@ -7,36 +11,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class AlignTTS(TTSModel): - """AlignTTS with modified duration predictor. - https://arxiv.org/pdf/2003.01950.pdf - - Encoder -> DurationPredictor -> Decoder - - AlignTTS's Abstract - Targeting at both high efficiency and performance, we propose AlignTTS to predict the - mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a - sequence of characters, and the duration of each character is determined by a duration predictor.Instead of - adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented - to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s - how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean - option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. - - Note: - Original model uses a separate character embedding layer for duration predictor. However, it causes the - duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, - we predict durations based on encoder outputs which has higher level information about input characters. This - enables training without phases as in the original paper. - - Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture - differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. - +@dataclass +class AlignTTSArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters @@ -64,43 +48,98 @@ class AlignTTS(TTSModel): number of channels in speaker embedding vectors. Defaults to 0. 
""" + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 256 + hidden_channels_dp: int = 256 + encoder_type: str = "fftransformer" + encoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + decoder_type: str = "fftransformer" + decoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + length_scale: float = 1.0 + num_speakers: int = 0 + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 + + +class AlignTTS(BaseTTS): + """AlignTTS with modified duration predictor. + https://arxiv.org/pdf/2003.01950.pdf + + Encoder -> DurationPredictor -> Decoder + + Check ```AlignTTSArgs``` for the class arguments. + + Examples: + >>> from TTS.tts.configs import AlignTTSConfig + >>> config = AlignTTSConfig() + >>> config.model_args.num_chars = 50 + >>> model = AlignTTS(config) + + Paper Abstract: + Targeting at both high efficiency and performance, we propose AlignTTS to predict the + mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a + sequence of characters, and the duration of each character is determined by a duration predictor.Instead of + adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented + to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s + how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean + option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. + + Note: + Original model uses a separate character embedding layer for duration predictor. However, it causes the + duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, + we predict durations based on encoder outputs which has higher level information about input characters. This + enables training without phases as in the original paper. + + Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture + differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. 
+ + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - num_chars, - out_channels, - hidden_channels=256, - hidden_channels_dp=256, - encoder_type="fftransformer", - encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - decoder_type="fftransformer", - decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - length_scale=1, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + def __init__(self, config: Coqpit): super().__init__() + self.config = config self.phase = -1 - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels_dp) + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) + self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) - self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) - self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels) + self.embedded_speaker_dim = 0 + self.init_multispeaker(config) - if num_speakers > 1 and not external_c: - # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) - nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + self.embedded_speaker_dim, + ) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels_dp) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + self.mod_layer = nn.Conv1d(config.model_args.hidden_channels, config.model_args.hidden_channels, 1) + + self.mdn_block = MDNBlock(config.model_args.hidden_channels, 2 * config.model_args.out_channels) + + if self.embedded_speaker_dim > 0 and self.embedded_speaker_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(self.embedded_speaker_dim, config.model_args.hidden_channels, 1) @staticmethod def compute_log_probs(mu, log_sigma, y): @@ -164,11 +203,12 @@ class AlignTTS(TTSModel): # project g to decoder dim. 
if hasattr(self, "proj_g"): g = self.proj_g(g) + return x + g def _forward_encoder(self, x, x_lengths, g=None): if hasattr(self, "emb_g"): - g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1] + g = nn.functional.normalize(self.speaker_embedding(g)) # [B, C, 1] if g is not None: g = g.unsqueeze(-1) @@ -315,7 +355,9 @@ class AlignTTS(TTSModel): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use + def train_log( + self, ap: AudioProcessor, batch: dict, outputs: dict + ) -> Tuple[Dict, Dict]: # pylint: disable=no-self-use model_outputs = outputs["model_outputs"] alignments = outputs["alignments"] mel_input = batch["mel_input"] @@ -332,7 +374,7 @@ class AlignTTS(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -349,6 +391,11 @@ class AlignTTS(TTSModel): self.eval() assert not self.training + def get_criterion(self): + from TTS.tts.layers.losses import AlignTTSLoss # pylint: disable=import-outside-toplevel + + return AlignTTSLoss(self.config) + @staticmethod def _set_phase(config, global_step): """Decide AlignTTS training phase""" diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py new file mode 100644 index 00000000..a99e1926 --- /dev/null +++ b/TTS/tts/models/base_tacotron.py @@ -0,0 +1,286 @@ +import copy +from abc import abstractmethod +from dataclasses import dataclass +from typing import Dict, List + +import torch +from coqpit import MISSING, Coqpit +from torch import nn + +from TTS.tts.layers.losses import TacotronLoss +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.text import make_symbols +from TTS.utils.generic_utils import format_aux_input +from TTS.utils.training import gradual_training_scheduler + + +@dataclass +class BaseTacotronArgs(Coqpit): + """TODO: update Tacotron configs using it""" + + num_chars: int = MISSING + num_speakers: int = MISSING + r: int = MISSING + out_channels: int = 80 + decoder_output_dim: int = 80 + attn_type: str = "original" + attn_win: bool = False + attn_norm: str = "softmax" + prenet_type: str = "original" + prenet_dropout: bool = True + prenet_dropout_at_inference: bool = False + forward_attn: bool = False + trans_agent: bool = False + forward_attn_mask: bool = False + location_attn: bool = True + attn_K: int = 5 + separate_stopnet: bool = True + bidirectional_decoder: bool = False + double_decoder_consistency: bool = False + ddc_r: int = None + encoder_in_features: int = 512 + decoder_in_features: int = 512 + d_vector_dim: int = None + use_gst: bool = False + gst: bool = None + gradual_training: bool = None + + +class BaseTacotron(BaseTTS): + def __init__(self, config: Coqpit): + """Abstract Tacotron class""" + super().__init__() + + for key in config: + setattr(self, key, config[key]) + + # layers + self.embedding = None + self.encoder = None + self.decoder = None + self.postnet = None + + # init tensors + self.embedded_speakers = None + self.embedded_speakers_projected = None + + # global style token + if self.gst and self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim + self.gst_layer = None + + # additional layers + self.decoder_backward 
= None + self.coarse_decoder = None + + # init multi-speaker layers + self.init_multispeaker(config) + + @staticmethod + def _format_aux_input(aux_input: Dict) -> Dict: + return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) + + ############################# + # INIT FUNCTIONS + ############################# + + def _init_states(self): + self.embedded_speakers = None + self.embedded_speakers_projected = None + + def _init_backward_decoder(self): + self.decoder_backward = copy.deepcopy(self.decoder) + + def _init_coarse_decoder(self): + self.coarse_decoder = copy.deepcopy(self.decoder) + self.coarse_decoder.r_init = self.ddc_r + self.coarse_decoder.set_r(self.ddc_r) + + ############################# + # CORE FUNCTIONS + ############################# + + @abstractmethod + def forward(self): + pass + + @abstractmethod + def inference(self): + pass + + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if "r" in state: + self.decoder.set_r(state["r"]) + else: + self.decoder.set_r(state["config"]["r"]) + if eval: + self.eval() + assert not self.training + + def get_criterion(self) -> nn.Module: + return TacotronLoss(self.config) + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel + parse_symbols, + phonemes, + symbols, + ) + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + return model_characters, config + + @staticmethod + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def get_aux_input(self, **kwargs) -> Dict: + """Compute Tacotron's auxiliary inputs based on model config. + - speaker d_vector + - style wav for GST + - speaker ID for speaker embedding + """ + # setup speaker_id + if self.config.use_speaker_embedding: + speaker_id = kwargs.get("speaker_id", 0) + else: + speaker_id = None + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) + if self.config.use_d_vector_file and self.config.use_speaker_embedding + else None + ) + # setup style_mel + if "style_wav" in kwargs: + style_wav = kwargs["style_wav"] + elif self.config.has("gst_style_input"): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and "use_gst" in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
+ style_wav = {} + print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") + for i in range(self.config.gst["gst_num_style_tokens"]): + style_wav[str(i)] = 0 + aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} + return aux_inputs + + ############################# + # COMMON COMPUTE FUNCTIONS + ############################# + + def compute_masks(self, text_lengths, mel_lengths): + """Compute masks against sequence paddings.""" + # B x T_in_max (boolean) + input_mask = sequence_mask(text_lengths) + output_mask = None + if mel_lengths is not None: + max_len = mel_lengths.max() + r = self.decoder.r + max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len + output_mask = sequence_mask(mel_lengths, max_len=max_len) + return input_mask, output_mask + + def _backward_pass(self, mel_specs, encoder_outputs, mask): + """Run backwards decoder""" + decoder_outputs_b, alignments_b, _ = self.decoder_backward( + encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask + ) + decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() + return decoder_outputs_b, alignments_b + + def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask): + """Double Decoder Consistency""" + T = mel_specs.shape[1] + if T % self.coarse_decoder.r > 0: + padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) + mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0)) + decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( + encoder_outputs.detach(), mel_specs, input_mask + ) + # scale_factor = self.decoder.r_init / self.decoder.r + alignments_backward = torch.nn.functional.interpolate( + alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" + ).transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) + decoder_outputs_backward = decoder_outputs_backward[:, :T, :] + return decoder_outputs_backward, alignments_backward + + ############################# + # EMBEDDING FUNCTIONS + ############################# + + def compute_speaker_embedding(self, speaker_ids): + """Compute speaker embedding vectors""" + if hasattr(self, "speaker_embedding") and speaker_ids is None: + raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") + if hasattr(self, "speaker_embedding") and speaker_ids is not None: + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) + if hasattr(self, "speaker_project_mel") and speaker_ids is not None: + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) + + def compute_gst(self, inputs, style_input, speaker_embedding=None): + """Compute global style token""" + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) + if speaker_embedding is not None: + query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) + + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + else: + gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) + return inputs + + @staticmethod + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ + return outputs + + @staticmethod + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) + return outputs + + ############################# + # CALLBACKS + ############################# + + def on_epoch_start(self, trainer): + """Callback for setting values wrt gradual training schedule. + + Args: + trainer (TrainerTTS): TTS trainer object that is used to train this model. + """ + if self.gradual_training: + r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) + trainer.config.r = r + self.decoder.set_r(r) + if trainer.config.bidirectional_decoder: + trainer.model.decoder_backward.set_r(r) + trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) + trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) + print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py new file mode 100644 index 00000000..1de7ba92 --- /dev/null +++ b/TTS/tts/models/base_tts.py @@ -0,0 +1,233 @@ +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.model import BaseModel +from TTS.tts.datasets import TTSDataset +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text import make_symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor + +# pylint: skip-file + + +class BaseTTS(BaseModel): + """Abstract `tts` class. Every new `tts` model must inherit this. 
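
For illustration, the style-token dict that `get_aux_input` builds and `compute_gst` consumes above can also be written by hand at inference. A minimal sketch, assuming a model with 10 GST tokens; the variable names below are placeholders, not identifiers from this patch:

    # Weight GST style tokens directly instead of passing a reference wav.
    # Keys are token indices as strings, values are amplifier weights: the
    # exact shape `compute_gst` iterates over via `style_input.items()`.
    num_style_tokens = 10  # e.g. config.gst["gst_num_style_tokens"]
    style_wav = {str(i): 0.0 for i in range(num_style_tokens)}
    style_wav["2"] = 0.3  # emphasize the third learned style token
    aux_inputs = {"speaker_id": None, "style_wav": style_wav, "d_vector": None}
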
+ + It defines `tts` specific functions on top of `Model`. + + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + return model_characters, config + + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def init_multispeaker(self, config: Coqpit, data: List = None): + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + If you need a different behaviour, override this function for your model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + # init speaker manager + self.speaker_manager = get_speaker_manager(config, data=data) + self.num_speakers = self.speaker_manager.num_speakers + # init speaker embedding layer + if config.use_speaker_embedding and not config.use_d_vector_file: + self.embedded_speaker_dim = ( + config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 + ) + self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + def get_aux_input(self, **kwargs) -> Dict: + """Prepare and return `aux_input` used by `forward()`""" + pass + + def format_batch(self, batch: Dict) -> Dict: + """Generic batch formatting for `TTSDataset`. + + You must override this if you use a custom dataset. + + Args: + batch (Dict): [description] + + Returns: + Dict: [description] + """ + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # compute durations from attention masks + durations = None + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, : text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. + stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "speaker_names": speaker_names, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "d_vectors": d_vectors, + "max_text_length": float(max_text_length), + "max_spec_length": float(max_spec_length), + "item_idx": item_idx, + } + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool, num_gpus: int + ) -> "DataLoader": + if is_eval and not config.run_eval: + loader = None + else: + # setup multi-speaker attributes + if hasattr(self, "speaker_manager"): + speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None + d_vector_mapping = ( + self.speaker_manager.d_vectors + if config.use_speaker_embedding and config.use_d_vector_file + else None + ) + else: + speaker_id_mapping = None + d_vector_mapping = None + + # init dataloader + dataset = TTSDataset( + outputs_per_step=config.r if "r" in config else 1, + text_cleaner=config.text_cleaner, + compute_linear_spec=config.model.lower() == "tacotron", + meta_data=data_items, + ap=ap, + tp=config.characters, + add_blank=config["add_blank"], + batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, + min_seq_len=config.min_seq_len, + max_seq_len=config.max_seq_len, + phoneme_cache_path=config.phoneme_cache_path, + use_phonemes=config.use_phonemes, + phoneme_language=config.phoneme_language, + enable_eos_bos=config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_id_mapping=speaker_id_mapping, + d_vector_mapping=d_vector_mapping + if config.use_speaker_embedding and config.use_d_vector_file + else None, + ) + + if config.use_phonemes and config.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=config.eval_batch_size if is_eval else config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def test_run(self) -> Tuple[Dict, Dict]: + """Generic test run for `tts` models used by `Trainer`. + + You can override this for a different behaviour. + + Returns: + Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
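
To make the contract concrete, a minimal sketch of how a `BaseTTS` subclass would consume the dictionary returned by `format_batch` above; the class name, forward signature and criterion call are illustrative placeholders, only the batch keys come from this patch:

    from typing import Dict, Tuple

    from torch import nn

    from TTS.tts.models.base_tts import BaseTTS


    class MyTTS(BaseTTS):  # hypothetical model, for illustration only
        def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
            # key names follow BaseTTS.format_batch()
            text_input = batch["text_input"]
            text_lengths = batch["text_lengths"]
            mel_input = batch["mel_input"]
            mel_lengths = batch["mel_lengths"]
            outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths)
            # model-specific loss; assumed to return a dict of named loss terms
            loss_dict = criterion(outputs["model_outputs"], mel_input, mel_lengths)
            return outputs, loss_dict
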
+ """ + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_aux_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) + return test_figures, test_audios diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index a30eadb4..ca2682dc 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -4,131 +4,89 @@ import torch from torch import nn from torch.nn import functional as F +from TTS.tts.configs import GlowTTSConfig from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class GlowTTS(TTSModel): +class GlowTTS(BaseTTS): """Glow TTS models from https://arxiv.org/abs/2005.11129 - Args: - num_chars (int): number of embedding characters. - hidden_channels_enc (int): number of embedding and encoder channels. - hidden_channels_dec (int): number of decoder channels. - use_encoder_prenet (bool): enable/disable prenet for encoder. Prenet modules are hard-coded for each alternative encoder. - hidden_channels_dp (int): number of duration predictor channels. - out_channels (int): number of output channels. It should be equal to the number of spectrogram filter. - num_flow_blocks_dec (int): number of decoder blocks. - kernel_size_dec (int): decoder kernel size. - dilation_rate (int): rate to increase dilation by each layer in a decoder block. - num_block_layers (int): number of decoder layers in each decoder block. - dropout_p_dec (float): dropout rate for decoder. - num_speaker (int): number of speaker to define the size of speaker embedding layer. - c_in_channels (int): number of speaker embedding channels. It is set to 512 if embeddings are learned. - num_splits (int): number of split levels in inversible conv1x1 operation. - num_squeeze (int): number of squeeze levels. When squeezing channels increases and time steps reduces by the factor 'num_squeeze'. - sigmoid_scale (bool): enable/disable sigmoid scaling in decoder. - mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. - encoder_type (str): encoder module type. - encoder_params (dict): encoder module parameters. - d_vector_dim (int): channels of external speaker embedding vectors. + Paper abstract: + Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate + mel-spectrograms from text in parallel. 
Despite the advantage, the parallel TTS models cannot be trained + without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS, + a flow-based generative model for parallel TTS that does not require any external aligner. By combining the + properties of flows and dynamic programming, the proposed model searches for the most probable monotonic + alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard + monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows + enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over + the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our + model can be easily extended to a multi-speaker setting. + + Check `GlowTTSConfig` for class arguments. """ - def __init__( - self, - num_chars, - hidden_channels_enc, - hidden_channels_dec, - use_encoder_prenet, - hidden_channels_dp, - out_channels, - num_flow_blocks_dec=12, - inference_noise_scale=0.33, - kernel_size_dec=5, - dilation_rate=5, - num_block_layers=4, - dropout_p_dp=0.1, - dropout_p_dec=0.05, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - encoder_type="transformer", - encoder_params=None, - d_vector_dim=None, - ): + def __init__(self, config: GlowTTSConfig): super().__init__() - self.num_chars = num_chars - self.hidden_channels_dp = hidden_channels_dp - self.hidden_channels_enc = hidden_channels_enc - self.hidden_channels_dec = hidden_channels_dec - self.out_channels = out_channels - self.num_flow_blocks_dec = num_flow_blocks_dec - self.kernel_size_dec = kernel_size_dec - self.dilation_rate = dilation_rate - self.num_block_layers = num_block_layers - self.dropout_p_dec = dropout_p_dec - self.num_speakers = num_speakers - self.c_in_channels = c_in_channels - self.num_splits = num_splits - self.num_squeeze = num_squeeze - self.sigmoid_scale = sigmoid_scale - self.mean_only = mean_only - self.use_encoder_prenet = use_encoder_prenet - self.inference_noise_scale = inference_noise_scale - # model constants. - self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. - self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. 
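
A short usage sketch of the config-driven constructor that replaces the keyword-argument `__init__` removed in this hunk; it assumes `GlowTTSConfig` exposes the same fields the old arguments covered, for example `length_scale` (larger values slow the speech down) and `inference_noise_scale` (the variance applied to the random z vector at inference):

    from TTS.tts.configs import GlowTTSConfig  # import added by this patch
    from TTS.tts.models.glow_tts import GlowTTS

    config = GlowTTSConfig()            # field names assumed to mirror the old kwargs
    model = GlowTTS(config)             # all hyperparameters are read from the config
    model.length_scale = 1.2            # slow generated speech down slightly
    model.inference_noise_scale = 0.33  # noise applied to z at inference
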
- self.d_vector_dim = d_vector_dim + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + self.init_multispeaker(config) + + # pass all config fields to `self` + # for fewer code change + self.config = config + for key in config: + setattr(self, key, config[key]) # if is a multispeaker and c_in_channels is 0, set to 256 - if num_speakers > 1: - if self.c_in_channels == 0 and not self.d_vector_dim: + self.c_in_channels = 0 + if self.num_speakers > 1: + if self.d_vector_dim: + self.c_in_channels = self.d_vector_dim + elif self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.d_vector_dim: - self.c_in_channels = self.d_vector_dim self.encoder = Encoder( - num_chars, - out_channels=out_channels, - hidden_channels=hidden_channels_enc, - hidden_channels_dp=hidden_channels_dp, - encoder_type=encoder_type, - encoder_params=encoder_params, - mean_only=mean_only, - use_prenet=use_encoder_prenet, - dropout_p_dp=dropout_p_dp, + self.num_chars, + out_channels=self.out_channels, + hidden_channels=self.hidden_channels_enc, + hidden_channels_dp=self.hidden_channels_dp, + encoder_type=self.encoder_type, + encoder_params=self.encoder_params, + mean_only=self.mean_only, + use_prenet=self.use_encoder_prenet, + dropout_p_dp=self.dropout_p_dp, c_in_channels=self.c_in_channels, ) self.decoder = Decoder( - out_channels, - hidden_channels_dec, - kernel_size_dec, - dilation_rate, - num_flow_blocks_dec, - num_block_layers, - dropout_p=dropout_p_dec, - num_splits=num_splits, - num_squeeze=num_squeeze, - sigmoid_scale=sigmoid_scale, + self.out_channels, + self.hidden_channels_dec, + self.kernel_size_dec, + self.dilation_rate, + self.num_flow_blocks_dec, + self.num_block_layers, + dropout_p=self.dropout_p_dec, + num_splits=self.num_splits, + num_squeeze=self.num_squeeze, + sigmoid_scale=self.sigmoid_scale, c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not d_vector_dim: + if self.num_speakers > 1 and not self.d_vector_dim: # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) + self.emb_g = nn.Embedding(self.num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @staticmethod @@ -377,7 +335,7 @@ class GlowTTS(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -406,3 +364,8 @@ class GlowTTS(TTSModel): self.eval() self.store_inverse() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import GlowTTSLoss # pylint: disable=import-outside-toplevel + + return GlowTTSLoss() diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 44a47722..2eb70a6b 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -1,4 +1,7 @@ +from dataclasses import dataclass, field + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder @@ -6,25 +9,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path -from TTS.tts.models.abstract_tts import TTSModel +from TTS.tts.models.base_tts 
import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor -class SpeedySpeech(TTSModel): - """Speedy Speech model - https://arxiv.org/abs/2008.03802 - - Encoder -> DurationPredictor -> Decoder - - This model is able to achieve a reasonable performance with only - ~3M model parameters and convolutional layers. - - This model requires precomputed phoneme durations to train a duration predictor. At inference - it only uses the duration predictor to compute durations and expand encoder outputs respectively. - +@dataclass +class SpeedySpeechArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size. @@ -36,49 +30,107 @@ class SpeedySpeech(TTSModel): decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'. decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }. num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0. - external_c (bool, optional): enable external speaker embeddings. Defaults to False. - c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0. + use_d_vector (bool, optional): enable external speaker embeddings. Defaults to False. + d_vector_dim (int, optional): number of channels in speaker embedding vectors. Defaults to 0. """ - # pylint: disable=dangerous-default-value - - def __init__( - self, - num_chars, - out_channels, - hidden_channels, - positional_encoding=True, - length_scale=1, - encoder_type="residual_conv_bn", - encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, - decoder_type="residual_conv_bn", - decoder_params={ + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 128 + num_speakers: int = 0 + positional_encoding: bool = True + length_scale: int = 1 + encoder_type: str = "residual_conv_bn" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13, + } + ) + decoder_type: str = "residual_conv_bn" + decoder_params: dict = field( + default_factory=lambda: { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17, - }, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + } + ) + use_d_vector: bool = False + d_vector_dim: int = 0 + +class SpeedySpeech(BaseTTS): + """Speedy Speech model + https://arxiv.org/abs/2008.03802 + + Encoder -> DurationPredictor -> Decoder + + Paper abstract: + While recent neural sequence-to-sequence models have greatly improved the quality of speech + synthesis, there has not been a system capable of fast training, fast inference and high-quality audio synthesis + at the same time. We propose a student-teacher network capable of high-quality faster-than-real-time spectrogram + synthesis, with low requirements on computational resources and fast training time. We show that self-attention + layers are not necessary for generation of high quality audio. 
We utilize simple convolutional blocks with + residual connections in both student and teacher networks and use only a single attention layer in the teacher + model. Coupled with a MelGAN vocoder, our model's voice quality was rated significantly higher than Tacotron 2. + Our model can be efficiently trained on a single GPU and can run in real time even on a CPU. We provide both + our source code and audio samples in our GitHub repository. + + Notes: + The vanilla model is able to achieve a reasonable performance with only + ~3M model parameters and convolutional layers. + + This model requires precomputed phoneme durations to train a duration predictor. At inference + it only uses the duration predictor to compute durations and expand encoder outputs respectively. + + You can also mix and match different encoder and decoder networks beyond the paper. + + Check `SpeedySpeechArgs` for arguments. + """ + + # pylint: disable=dangerous-default-value + + def __init__(self, config: Coqpit): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - if positional_encoding: - self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) + self.config = config - if num_speakers > 1 and not external_c: + if "characters" in config: + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) + self.emb = nn.Embedding(config.model_args.num_chars, config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + config.model_args.d_vector_dim, + ) + if config.model_args.positional_encoding: + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels + config.model_args.d_vector_dim) + + if config.model_args.num_speakers > 1 and not config.model_args.use_d_vector: # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) + self.emb_g = nn.Embedding(config.model_args.num_speakers, config.model_args.d_vector_dim) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + if config.model_args.d_vector_dim > 0 and config.model_args.d_vector_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(config.model_args.d_vector_dim, config.model_args.hidden_channels, 1) @staticmethod def expand_encoder_outputs(en, dr, x_mask, y_mask): @@ -244,7 +296,7 @@ class SpeedySpeech(TTSModel): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch: dict, criterion: nn.Module): return self.train_step(batch, criterion) @@ -260,3 
+312,8 @@ class SpeedySpeech(TTSModel): if eval: self.eval() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import SpeedySpeechLoss # pylint: disable=import-outside-toplevel + + return SpeedySpeechLoss(self.config) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 317d1905..95b4a358 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,166 +1,86 @@ # coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class Tacotron(TacotronAbstract): +class Tacotron(BaseTacotron): """Tacotron as in https://arxiv.org/abs/1703.10135 - It's an autoregressive encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. 
- d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` - output frames to the prenet. - gradual_trainin (List): Gradual training schedule. If None or `[]`, no gradual training is used. - Defaults to `[]`. - max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. + Check `TacotronConfig` for the arguments. """ - def __init__( - self, - num_chars, - num_speakers, - r=5, - postnet_output_dim=1025, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=256, - decoder_in_features=256, - d_vector_dim=None, - use_gst=False, - gst=None, - memory_size=5, - gradual_training=None, - max_decoder_steps=500, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - d_vector_dim, - use_gst, - gst, - gradual_training, - ) + def __init__(self, config: Coqpit): + super().__init__(config) - # speaker embedding layers + self.num_chars, self.config = self.get_characters(config) + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) + + # speaker embedding layer if self.num_speakers > 1: - if not self.use_d_vectors: - d_vector_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += d_vector_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding.weight.data.normal_(0, 0.3) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.decoder_output_dim, + self.r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = 
PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) + self.postnet = PostCBHG(self.decoder_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - d_vector_dim=d_vector_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder if self.bidirectional_decoder: @@ -169,21 +89,21 @@ class Tacotron(TacotronAbstract): if self.double_decoder_consistency: self.coarse_decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - ddc_r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.decoder_output_dim, + self.ddc_r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): @@ -205,7 +125,9 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) # speaker embedding if self.num_speakers > 1: if not self.use_d_vectors: @@ -341,7 +263,7 @@ class Tacotron(TacotronAbstract): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap, batch, outputs): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: postnet_outputs = outputs["model_outputs"] alignments = outputs["alignments"] alignments_backward = outputs["alignments_backward"] @@ -362,7 +284,7 @@ class Tacotron(TacotronAbstract): # Sample audio train_audio = ap.inv_spectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch, criterion): return self.train_step(batch, criterion) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index d56bd988..eaca3ff8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,160 +1,84 @@ # coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.visual import 
plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class Tacotron2(TacotronAbstract): +class Tacotron2(BaseTacotron): """Tacotron2 as in https://arxiv.org/abs/1712.05884 - - It's an autoregressive encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.tacotron.common_layers.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. - Defaults to `[]`. - max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. + Check `TacotronConfig` for the arguments. 
""" - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - d_vector_dim=None, - use_gst=False, - gst=None, - gradual_training=None, - max_decoder_steps=500, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - d_vector_dim, - use_gst, - gst, - gradual_training, - ) + def __init__(self, config: Coqpit): + super().__init__(config) + + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) # speaker embedding layer if self.num_speakers > 1: - if not self.use_d_vectors: - d_vector_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += d_vector_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = Postnet(self.postnet_output_dim) + self.postnet = Postnet(self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_g = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers - if self.gst and use_gst: + if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - d_vector_dim=d_vector_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + 
num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder @@ -165,19 +89,19 @@ class Tacotron2(TacotronAbstract): self.coarse_decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - ddc_r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - max_decoder_steps, + self.ddc_r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) @staticmethod @@ -206,7 +130,9 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, aux_input["d_vectors"]) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) if self.num_speakers > 1: if not self.use_d_vectors: # B x 1 x speaker_embed_dim @@ -342,7 +268,7 @@ class Tacotron2(TacotronAbstract): loss_dict["align_error"] = align_error return outputs, loss_dict - def train_log(self, ap, batch, outputs): + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: postnet_outputs = outputs["model_outputs"] alignments = outputs["alignments"] alignments_backward = outputs["alignments_backward"] @@ -363,7 +289,7 @@ class Tacotron2(TacotronAbstract): # Sample audio train_audio = ap.inv_melspectrogram(pred_spec.T) - return figures, train_audio + return figures, {"audio": train_audio} def eval_step(self, batch, criterion): return self.train_step(batch, criterion) diff --git a/TTS/tts/tf/models/tacotron2.py b/TTS/tts/tf/models/tacotron2.py index 9cc62070..7a1d695d 100644 --- a/TTS/tts/tf/models/tacotron2.py +++ b/TTS/tts/tf/models/tacotron2.py @@ -12,7 +12,7 @@ class Tacotron2(keras.models.Model): num_chars, num_speakers, r, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, @@ -31,7 +31,7 @@ class Tacotron2(keras.models.Model): super().__init__() self.r = r self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim + self.out_channels = out_channels self.bidirectional_decoder = bidirectional_decoder self.num_speakers = num_speakers self.speaker_embed_dim = 256 @@ -58,7 +58,7 @@ class Tacotron2(keras.models.Model): name="decoder", enable_tflite=enable_tflite, ) - self.postnet = Postnet(postnet_output_dim, 5, name="postnet") + self.postnet = Postnet(out_channels, 5, name="postnet") @tf.function(experimental_relax_shapes=True) def call(self, characters, text_lengths=None, frames=None, training=None): diff --git a/TTS/vocoder/models/base_vocoder.py b/TTS/vocoder/models/base_vocoder.py new file mode 100644 index 00000000..f879cd42 --- /dev/null +++ b/TTS/vocoder/models/base_vocoder.py @@ -0,0 +1,20 @@ +from TTS.model import BaseModel + +# pylint: skip-file + + +class BaseVocoder(BaseModel): + """Base `vocoder` class. Every new `vocoder` model must inherit this. + + It defines `vocoder` specific functions on top of `Model`. 
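+
+    Concrete vocoders in this package, e.g. `GAN` and `Wavernn`, inherit from it and
+    implement `train_step`, `eval_step`, `get_data_loader` and `get_criterion` for the trainer.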
+ + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + def __init__(self): + super().__init__() From 51005cdab4a908ef65d8c424dca4bf3aa9c7b867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:36:47 +0200 Subject: [PATCH 199/258] Update `tts.models.setup_model` --- TTS/tts/models/__init__.py | 144 ++++++++++--------------------------- 1 file changed, 38 insertions(+), 106 deletions(-) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 2a951267..c6390beb 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,110 +1,42 @@ +from TTS.tts.utils.text.symbols import make_symbols, parse_symbols from TTS.utils.generic_utils import find_module -def setup_model(num_chars, num_speakers, c, d_vector_dim=None): - print(" > Using model: {}".format(c.model)) - MyModel = find_module("TTS.tts.models", c.model.lower()) - if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - d_vector_dim=d_vector_dim, - max_decoder_steps=c.max_decoder_steps, - ) - elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - d_vector_dim=d_vector_dim, - max_decoder_steps=c.max_decoder_steps, - ) - elif c.model.lower() == "glow_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - hidden_channels_enc=c["hidden_channels_encoder"], - hidden_channels_dec=c["hidden_channels_decoder"], - hidden_channels_dp=c["hidden_channels_duration_predictor"], - out_channels=c.audio["num_mels"], - encoder_type=c.encoder_type, - encoder_params=c.encoder_params, - use_encoder_prenet=c["use_encoder_prenet"], - inference_noise_scale=c.inference_noise_scale, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.05, - num_speakers=num_speakers, - c_in_channels=0, - num_splits=4, - num_squeeze=2, - 
sigmoid_scale=False, - mean_only=True, - d_vector_dim=d_vector_dim, - ) - elif c.model.lower() == "speedy_speech": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - positional_encoding=c["positional_encoding"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - elif c.model.lower() == "align_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - hidden_channels_dp=c["hidden_channels_dp"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) +def setup_model(config): + print(" > Using model: {}".format(config.model)) + + MyModel = find_module("TTS.tts.models", config.model.lower()) + # define set of characters used by the model + if config.characters is not None: + # set characters from config + symbols, phonemes = make_symbols(**config.characters.to_dict()) # pylint: disable=redefined-outer-name + else: + from TTS.tts.utils.text.symbols import phonemes, symbols # pylint: disable=import-outside-toplevel + + # use default characters and assign them to config + config.characters = parse_symbols() + num_chars = len(phonemes) if config.use_phonemes else len(symbols) + # consider special `blank` character if `add_blank` is set True + num_chars = num_chars + getattr(config, "add_blank", False) + config.num_chars = num_chars + # compatibility fix + if "model_params" in config: + config.model_params.num_chars = num_chars + if "model_args" in config: + config.model_args.num_chars = num_chars + model = MyModel(config) return model + + +# TODO; class registery +# def import_models(models_dir, namespace): +# for file in os.listdir(models_dir): +# path = os.path.join(models_dir, file) +# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): +# model_name = file[: file.find(".py")] if file.endswith(".py") else file +# importlib.import_module(namespace + "." 
+ model_name) +# +# +## automatically import any Python files in the models/ directory +# models_dir = os.path.dirname(__file__) +# import_models(models_dir, "TTS.tts.models") From e949e7ad583c9de11924aa80480e36604e79ad7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:37:23 +0200 Subject: [PATCH 200/258] Update vocoder models --- TTS/vocoder/models/gan.py | 246 +++++++++++++++++++++++++ TTS/vocoder/models/wavegrad.py | 190 ++++++++++++++++--- TTS/vocoder/models/wavernn.py | 322 ++++++++++++++++++++++++--------- 3 files changed, 639 insertions(+), 119 deletions(-) create mode 100644 TTS/vocoder/models/gan.py diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py new file mode 100644 index 00000000..58d6532e --- /dev/null +++ b/TTS/vocoder/models/gan.py @@ -0,0 +1,246 @@ +from inspect import signature +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss +from TTS.vocoder.models import setup_discriminator, setup_generator +from TTS.vocoder.models.base_vocoder import BaseVocoder +from TTS.vocoder.utils.generic_utils import plot_results + + +class GAN(BaseVocoder): + def __init__(self, config: Coqpit): + """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer. + It also helps mixing and matching different generator and disciminator networks easily. + + Args: + config (Coqpit): Model configuration. + + Examples: + Initializing the GAN model with HifiGAN generator and discriminator. + >>> from TTS.vocoder.configs import HifiganConfig + >>> config = HifiganConfig() + >>> model = GAN(config) + """ + super().__init__() + self.config = config + self.model_g = setup_generator(config) + self.model_d = setup_discriminator(config) + self.train_disc = False # if False, train only the generator. + self.y_hat_g = None # the last generator prediction to be passed onto the discriminator + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.model_g.forward(x) + + def inference(self, x: torch.Tensor) -> torch.Tensor: + return self.model_g.inference(x) + + def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + outputs = None + loss_dict = None + + x = batch["input"] + y = batch["waveform"] + + if optimizer_idx not in [0, 1]: + raise ValueError(" [!] 
Unexpected `optimizer_idx`.") + + if optimizer_idx == 0: + # GENERATOR + # generator pass + y_hat = self.model_g(x)[:, :, : y.size(2)] + self.y_hat_g = y_hat # save for discriminator + y_hat_sub = None + y_sub = None + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat_sub = y_hat + y_hat = self.model_g.pqmf_synthesis(y_hat) + self.y_hat_g = y_hat # save for discriminator + y_sub = self.model_g.pqmf_analysis(y) + + scores_fake, feats_fake, feats_real = None, None, None + if self.train_disc: + + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat, x) + else: + D_out_fake = self.model_d(y_hat) + D_out_real = None + + if self.config.use_feat_match_loss: + with torch.no_grad(): + D_out_real = self.model_d(y) + + # format D outputs + if isinstance(D_out_fake, tuple): + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + feats_real = None + else: + _, feats_real = D_out_real + else: + scores_fake = D_out_fake + feats_fake, feats_real = None, None + + # compute losses + loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub) + outputs = {"model_outputs": y_hat} + + if optimizer_idx == 1: + # DISCRIMINATOR + if self.train_disc: + # use different samples for G and D trainings + if self.config.diff_samples_for_G_and_D: + x_d = batch["input_disc"] + y_d = batch["waveform_disc"] + # use a different sample than generator + with torch.no_grad(): + y_hat = self.model_g(x_d) + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat = self.model_g.pqmf_synthesis(y_hat) + else: + # use the same samples as generator + x_d = x.clone() + y_d = y.clone() + y_hat = self.y_hat_g + + # run D with or without cond. features + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat.detach().clone(), x_d) + D_out_real = self.model_d(y_d, x_d) + else: + D_out_fake = self.model_d(y_hat.detach()) + D_out_real = self.model_d(y_d) + + # format D outputs + if isinstance(D_out_fake, tuple): + # self.model_d returns scores and features + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + scores_real, feats_real = None, None + else: + scores_real, feats_real = D_out_real + else: + # model D returns only scores + scores_fake = D_out_fake + scores_real = D_out_real + + # compute losses + loss_dict = criterion[optimizer_idx](scores_fake, scores_real) + outputs = {"model_outputs": y_hat} + + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + y_hat = outputs[0]["model_outputs"] + y = batch["waveform"] + figures = plot_results(y_hat, y, ap, "train") + sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + audios = {"train/audio": sample_voice} + return figures, audios + + @torch.no_grad() + def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion, optimizer_idx) + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return self.train_log(ap, batch, outputs) + + def load_checkpoint( + self, + config: Coqpit, + checkpoint_path: str, + eval: bool = False, # pylint: disable=unused-argument, redefined-builtin + ) -> None: + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + # band-aid for older than v0.0.15 GAN models + if "model_disc" in state: + self.model_g.load_checkpoint(config, checkpoint_path, eval) + else: + self.load_state_dict(state["model"]) + if eval: + 
self.model_d = None + if hasattr(self.model_g, "remove_weight_norm"): + self.model_g.remove_weight_norm() + + def on_train_step_start(self, trainer) -> None: + self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator + + def get_optimizer(self): + optimizer1 = get_optimizer( + self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, self.model_g + ) + optimizer2 = get_optimizer( + self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d + ) + return [optimizer1, optimizer2] + + def get_lr(self): + return [self.config.lr_gen, self.config.lr_disc] + + def get_scheduler(self, optimizer): + scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0]) + scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1]) + return [scheduler1, scheduler2] + + @staticmethod + def format_batch(batch): + if isinstance(batch[0], list): + x_G, y_G = batch[0] + x_D, y_D = batch[1] + return {"input": x_G, "waveform": y_G, "input_disc": x_D, "waveform_disc": y_D} + x, y = batch + return {"input": x, "waveform": y} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def get_criterion(self): + """Return criterions for the optimizers""" + return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)] diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 84dde957..03d5160e 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,65 +1,105 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple + import numpy as np import torch +from coqpit import Coqpit from torch import nn from torch.nn.utils import weight_norm +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler -from ..layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.model import BaseModel +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets import WaveGradDataset +from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.vocoder.utils.generic_utils import plot_results -class Wavegrad(nn.Module): +@dataclass +class WavegradArgs(Coqpit): + in_channels: int = 80 + out_channels: int = 1 + use_weight_norm: bool = False + y_conv_channels: int = 32 + x_conv_channels: int = 768 + dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: List[int] = field(default_factory=lambda: [512, 
512, 256, 128, 128]) + upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: List[List[int]] = field( + default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] + ) + + +class Wavegrad(BaseModel): + """🐸 🌊 WaveGrad 🌊 model. + Paper - https://arxiv.org/abs/2009.00713 + + Examples: + Initializing the model. + + >>> from TTS.vocoder.configs import WavegradConfig + >>> config = WavegradConfig() + >>> model = Wavegrad(config) + + Paper Abstract: + This paper introduces WaveGrad, a conditional model for waveform generation which estimates gradients of the + data density. The model is built on prior work on score matching and diffusion probabilistic models. It starts + from a Gaussian white noise signal and iteratively refines the signal via a gradient-based sampler conditioned + on the mel-spectrogram. WaveGrad offers a natural way to trade inference speed for sample quality by adjusting + the number of refinement steps, and bridges the gap between non-autoregressive and autoregressive models in + terms of audio quality. We find that it can generate high fidelity audio samples using as few as six iterations. + Experiments reveal WaveGrad to generate high fidelity audio, outperforming adversarial non-autoregressive + baselines and matching a strong likelihood-based autoregressive baseline using fewer sequential operations. + Audio samples are available at this https URL. + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - in_channels=80, - out_channels=1, - use_weight_norm=False, - y_conv_channels=32, - x_conv_channels=768, - dblock_out_channels=[128, 128, 256, 512], - ublock_out_channels=[512, 512, 256, 128, 128], - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ): + def __init__(self, config: Coqpit): super().__init__() - - self.use_weight_norm = use_weight_norm - self.hop_len = np.prod(upsample_factors) + self.config = config + self.use_weight_norm = config.model_params.use_weight_norm + self.hop_len = np.prod(config.model_params.upsample_factors) self.noise_level = None self.num_steps = None self.beta = None self.alpha = None self.alpha_hat = None - self.noise_level = None self.c1 = None self.c2 = None self.sigma = None # dblocks - self.y_conv = Conv1d(1, y_conv_channels, 5, padding=2) + self.y_conv = Conv1d(1, config.model_params.y_conv_channels, 5, padding=2) self.dblocks = nn.ModuleList([]) - ic = y_conv_channels - for oc, df in zip(dblock_out_channels, reversed(upsample_factors)): + ic = config.model_params.y_conv_channels + for oc, df in zip(config.model_params.dblock_out_channels, reversed(config.model_params.upsample_factors)): self.dblocks.append(DBlock(ic, oc, df)) ic = oc # film self.film = nn.ModuleList([]) - ic = y_conv_channels - for oc in reversed(ublock_out_channels): + ic = config.model_params.y_conv_channels + for oc in reversed(config.model_params.ublock_out_channels): self.film.append(FiLM(ic, oc)) ic = oc - # ublocks + # ublocksn self.ublocks = nn.ModuleList([]) - ic = x_conv_channels - for oc, uf, ud in zip(ublock_out_channels, upsample_factors, upsample_dilations): + ic = config.model_params.x_conv_channels + for oc, uf, ud in zip( + config.model_params.ublock_out_channels, + config.model_params.upsample_factors, + config.model_params.upsample_dilations, + ): self.ublocks.append(UBlock(ic, oc, uf, ud)) ic = oc - self.x_conv = Conv1d(in_channels, x_conv_channels, 3, padding=1) - 
self.out_conv = Conv1d(oc, out_channels, 3, padding=1) + self.x_conv = Conv1d(config.model_params.in_channels, config.model_params.x_conv_channels, 3, padding=1) + self.out_conv = Conv1d(oc, config.model_params.out_channels, 3, padding=1) - if use_weight_norm: + if config.model_params.use_weight_norm: self.apply_weight_norm() def forward(self, x, spectrogram, noise_scale): @@ -180,7 +220,7 @@ class Wavegrad(nn.Module): if eval: self.eval() assert not self.training - if self.use_weight_norm: + if self.config.model_params.use_weight_norm: self.remove_weight_norm() betas = np.linspace( config["test_noise_schedule"]["min_val"], @@ -195,3 +235,93 @@ class Wavegrad(nn.Module): config["train_noise_schedule"]["num_steps"], ) self.compute_noise_level(betas) + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + # format data + x = batch["input"] + y = batch["waveform"] + + # set noise scale + noise, x_noisy, noise_scale = self.compute_y_n(y) + + # forward pass + noise_hat = self.forward(x_noisy, x, noise_scale) + + # compute losses + loss = criterion(noise, noise_hat) + return {"model_output": noise_hat}, {"loss": loss} + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + @torch.no_grad() + def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + def test_run(self, ap: AudioProcessor, samples: List[Dict], ouputs: Dict): # pylint: disable=unused-argument + # setup noise schedule and inference + noise_schedule = self.config["test_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) + for sample in samples: + x = sample["input"] + y = sample["waveform"] + # compute voice + y_pred = self.inference(x) + # compute spectrograms + figures = plot_results(y_pred, y, ap, "test") + # Sample audio + sample_voice = y_pred[0].squeeze(0).detach().cpu().numpy() + return figures, {"test/audio": sample_voice} + + def get_optimizer(self): + return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) + + def get_scheduler(self, optimizer): + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, optimizer) + + def get_criterion(self): + return torch.nn.L1Loss() + + @staticmethod + def format_batch(batch: Dict) -> Dict: + # return a whole audio segment + m, y = batch[0], batch[1] + y = y.unsqueeze(1) + return {"input": m, "waveform": y} + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: True, data_items: List, verbose: bool, num_gpus: int + ): + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=self.config.seq_len, + hop_len=ap.hop_length, + pad_short=self.config.pad_short, + conv_pad=self.config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.batch_size, + shuffle=num_gpus <= 1, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_eval_loader_workers if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def on_epoch_start(self, trainer): # pylint: disable=unused-argument + 
noise_schedule = self.config["train_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 04040931..a5d89d5a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,13 +1,21 @@ import sys import time +from dataclasses import dataclass, field +from typing import Dict, List, Tuple import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from coqpit import Coqpit +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler -# fix this -from TTS.utils.audio import AudioProcessor as ap +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.layers.losses import WaveRNNLoss +from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian @@ -135,89 +143,145 @@ class Upsample(nn.Module): return m.transpose(1, 2), aux -class WaveRNN(nn.Module): - def __init__( - self, - rnn_dims, - fc_dims, - mode, - mulaw, - pad, - use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - num_res_blocks, - hop_length, - sample_rate, - ): +@dataclass +class WavernnArgs(Coqpit): + """🐸 WaveRNN model arguments. + + rnn_dims (int): + Number of hidden channels in RNN layers. Defaults to 512. + fc_dims (int): + Number of hidden channels in fully-conntected layers. Defaults to 512. + compute_dims (int): + Number of hidden channels in the feature ResNet. Defaults to 128. + res_out_dim (int): + Number of hidden channels in the feature ResNet output. Defaults to 128. + num_res_blocks (int): + Number of residual blocks in the ResNet. Defaults to 10. + use_aux_net (bool): + enable/disable the feature ResNet. Defaults to True. + use_upsample_net (bool): + enable/ disable the upsampling networl. If False, basic upsampling is used. Defaults to True. + upsample_factors (list): + Upsampling factors. The multiply of the values must match the `hop_length`. Defaults to ```[4, 8, 8]```. + mode (str): + Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single + Gaussian Distribution and `bits` for quantized bits as the model's output. + mulaw (bool): + enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults + to `True`. + pad (int): + Padding applied to the input feature frames against the convolution layers of the feature network. + Defaults to 2. + """ + + rnn_dims: int = 512 + fc_dims: int = 512 + compute_dims: int = 128 + res_out_dims: int = 128 + num_res_blocks: int = 10 + use_aux_net: bool = True + use_upsample_net: bool = True + upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8]) + mode: str = "mold" # mold [string], gauss [string], bits [int] + mulaw: bool = True # apply mulaw if mode is bits + pad: int = 2 + feat_dims: int = 80 + + +class Wavernn(BaseVocoder): + def __init__(self, config: Coqpit): + """🐸 WaveRNN model. 
+ Original paper - https://arxiv.org/abs/1802.08435 + Official implementation - https://github.com/fatchord/WaveRNN + + Args: + config (Coqpit): [description] + + Raises: + RuntimeError: [description] + + Examples: + >>> from TTS.vocoder.configs import WavernnConfig + >>> config = WavernnConfig() + >>> model = Wavernn(config) + + Paper Abstract: + Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to + both estimating the data distribution and generating high-quality samples. Efficient sampling for this + class of models has however remained an elusive problem. With a focus on text-to-speech synthesis, we + describe a set of general techniques for reducing sampling time while maintaining high output quality. + We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that + matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it + possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight + pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of + parameters, large sparse networks perform better than small dense networks and this relationship holds for + sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample + high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on + subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple + samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an + orthogonal method for increasing sampling efficiency. + """ super().__init__() - self.mode = mode - self.mulaw = mulaw - self.pad = pad - self.use_upsample_net = use_upsample_net - self.use_aux_net = use_aux_net - if isinstance(self.mode, int): - self.n_classes = 2 ** self.mode - elif self.mode == "mold": + + self.args = config.model_params + self.config = config + + if isinstance(self.args.mode, int): + self.n_classes = 2 ** self.args.mode + elif self.args.mode == "mold": self.n_classes = 3 * 10 - elif self.mode == "gauss": + elif self.args.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) - self.rnn_dims = rnn_dims - self.aux_dims = res_out_dims // 4 - self.hop_length = hop_length - self.sample_rate = sample_rate + self.aux_dims = self.args.res_out_dims // 4 - if self.use_upsample_net: + if self.args.use_upsample_net: assert ( - np.cumproduct(upsample_factors)[-1] == self.hop_length + np.cumproduct(self.args.upsample_factors)[-1] == config.audio.hop_length ), " [!] 
upsample scales needs to be equal to hop_length" self.upsample = UpsampleNetwork( - feat_dims, - upsample_factors, - compute_dims, - num_res_blocks, - res_out_dims, - pad, - use_aux_net, + self.args.feat_dims, + self.args.upsample_factors, + self.args.compute_dims, + self.args.num_res_blocks, + self.args.res_out_dims, + self.args.pad, + self.args.use_aux_net, ) else: self.upsample = Upsample( - hop_length, - pad, - num_res_blocks, - feat_dims, - compute_dims, - res_out_dims, - use_aux_net, + config.audio.hop_length, + self.args.pad, + self.args.num_res_blocks, + self.args.feat_dims, + self.args.compute_dims, + self.args.res_out_dims, + self.args.use_aux_net, ) - if self.use_aux_net: - self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + if self.args.use_aux_net: + self.I = nn.Linear(self.args.feat_dims + self.aux_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims + self.aux_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims + self.aux_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims + self.aux_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) else: - self.I = nn.Linear(feat_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + self.I = nn.Linear(self.args.feat_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) - h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) mels, aux = self.upsample(mels) - if self.use_aux_net: + if self.args.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] a1 = aux[:, :, aux_idx[0] : aux_idx[1]] a2 = aux[:, :, aux_idx[1] : aux_idx[2]] @@ -226,7 +290,7 @@ class WaveRNN(nn.Module): x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) - if self.use_aux_net + if self.args.use_aux_net else torch.cat([x.unsqueeze(-1), mels], dim=2) ) x = self.I(x) @@ -236,15 +300,15 @@ class WaveRNN(nn.Module): x = x + res res = x - x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + x = torch.cat([x, a2], dim=2) if self.args.use_aux_net else x self.rnn2.flatten_parameters() x, _ = self.rnn2(x, h2) x = x + res - x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = torch.cat([x, a3], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = torch.cat([x, a4], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) return self.fc3(x) @@ -262,9 +326,9 @@ 
class WaveRNN(nn.Module): if mels.ndim == 2: mels = mels.unsqueeze(0) - wave_len = (mels.size(-1) - 1) * self.hop_length + wave_len = (mels.size(-1) - 1) * self.config.audio.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.args.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -274,11 +338,11 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).type_as(mels) - h2 = torch.zeros(b_size, self.rnn_dims).type_as(mels) + h1 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) + h2 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) x = torch.zeros(b_size, 1).type_as(mels) - if self.use_aux_net: + if self.args.use_aux_net: d = self.aux_dims aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] @@ -286,35 +350,35 @@ class WaveRNN(nn.Module): m_t = mels[:, i, :] - if self.use_aux_net: + if self.args.use_aux_net: a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) - x = torch.cat([x, m_t, a1_t], dim=1) if self.use_aux_net else torch.cat([x, m_t], dim=1) + x = torch.cat([x, m_t, a1_t], dim=1) if self.args.use_aux_net else torch.cat([x, m_t], dim=1) x = self.I(x) h1 = rnn1(x, h1) x = x + h1 - inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + inp = torch.cat([x, a2_t], dim=1) if self.args.use_aux_net else x h2 = rnn2(inp, h2) x = x + h2 - x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a3_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a4_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) logits = self.fc3(x) - if self.mode == "mold": + if self.args.mode == "mold": sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) x = sample.transpose(0, 1).type_as(mels) - elif self.mode == "gauss": + elif self.args.mode == "gauss": sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) x = sample.transpose(0, 1).type_as(mels) - elif isinstance(self.mode, int): + elif isinstance(self.args.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -322,7 +386,7 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -337,22 +401,22 @@ class WaveRNN(nn.Module): else: output = output[0] - if self.mulaw and isinstance(self.mode, int): - output = ap.mulaw_decode(output, self.mode) + if self.args.mulaw and isinstance(self.args.mode, int): + output = AudioProcessor.mulaw_decode(output, self.args.mode) # Fade-out at the end to avoid signal cutting out suddenly - fade_out = np.linspace(1, 0, 20 * self.hop_length) + fade_out = np.linspace(1, 0, 20 * self.config.audio.hop_length) output = output[:wave_len] if wave_len > len(fade_out): - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.config.audio.hop_length :] *= fade_out self.train() return output def gen_display(self, i, seq_len, b_size, start): gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 - realtime_ratio = gen_rate * 1000 / self.sample_rate + realtime_ratio = gen_rate * 1000 / self.config.audio.sample_rate stream( "%i/%i -- 
batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), @@ -486,3 +550,83 @@ class WaveRNN(nn.Module): if eval: self.eval() assert not self.training + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + mels = batch["input"] + waveform = batch["waveform"] + waveform_coarse = batch["waveform_coarse"] + + y_hat = self.forward(waveform, mels) + if isinstance(self.args.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + waveform_coarse = waveform_coarse.float() + waveform_coarse = waveform_coarse.unsqueeze(-1) + # compute losses + loss_dict = criterion(y_hat, waveform_coarse) + return {"model_output": y_hat}, loss_dict + + def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + @torch.no_grad() + def test_run( + self, ap: AudioProcessor, samples: List[Dict], output: Dict # pylint: disable=unused-argument + ) -> Tuple[Dict, Dict]: + figures = {} + audios = {} + for idx, sample in enumerate(samples): + x = sample["input"] + y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples) + x_hat = ap.melspectrogram(y_hat) + figures.update( + { + f"test_{idx}/ground_truth": plot_spectrogram(x.T), + f"test_{idx}/prediction": plot_spectrogram(x_hat.T), + } + ) + audios.update({f"test_{idx}/audio", y_hat}) + return figures, audios + + @staticmethod + def format_batch(batch: Dict) -> Dict: + waveform = batch[0] + mels = batch[1] + waveform_coarse = batch[2] + return {"input": mels, "waveform": waveform, "waveform_coarse": waveform_coarse} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + collate_fn=dataset.collate, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=True, + ) + return loader + + def get_criterion(self): + # define train functions + return WaveRNNLoss(self.args.mode) From d18198dff8e78bea146065086652fe72ec81d1b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:38:01 +0200 Subject: [PATCH 201/258] Implement `setup_model` for vocoder models --- TTS/vocoder/models/__init__.py | 147 +++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index e69de29b..cbd3950b 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -0,0 +1,147 @@ +import importlib +import re + +from coqpit import Coqpit + + +def to_camel(text): + text = text.capitalize() + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + + +def setup_model(config: Coqpit): + """Load models directly from configuration.""" + print(" > Vocoder Model: {}".format(config.model)) + if "discriminator_model" in config and "generator_model" in config: + MyModel = importlib.import_module("TTS.vocoder.models.gan") + MyModel = 
getattr(MyModel, "GAN") + else: + MyModel = importlib.import_module("TTS.vocoder.models." + config.model.lower()) + if config.model.lower() == "wavernn": + MyModel = getattr(MyModel, "Wavernn") + elif config.model.lower() == "gan": + MyModel = getattr(MyModel, "GAN") + elif config.model.lower() == "wavegrad": + MyModel = getattr(MyModel, "Wavegrad") + else: + MyModel = getattr(MyModel, to_camel(config.model)) + raise ValueError(f"Model {config.model} not exist!") + model = MyModel(config) + return model + + +def setup_generator(c): + print(" > Generator Model: {}".format(c.generator_model)) + MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.generator_model)) + # this is to preserve the Wavernn class name (instead of Wavernn) + if c.generator_model.lower() in "hifigan_generator": + model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + elif c.generator_model.lower() in "melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model in "melgan_fb_generator": + raise ValueError("melgan_fb_generator is now fullband_melgan_generator") + elif c.generator_model.lower() in "multiband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=4, + proj_kernel=7, + base_channels=384, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "fullband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "parallel_wavegan_generator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=c.audio["num_mels"], + dropout=0.0, + bias=True, + use_weight_norm=True, + upsample_factors=c.generator_model_params["upsample_factors"], + ) + else: + raise NotImplementedError(f"Model {c.generator_model} not implemented!") + return model + + +def setup_discriminator(c): + print(" > Discriminator Model: {}".format(c.discriminator_model)) + if "parallel_wavegan" in c.discriminator_model: + MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") + else: + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) + if c.discriminator_model in "hifigan_discriminator": + model = MyModel() + if c.discriminator_model in "random_window_discriminator": + model = MyModel( + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], + cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], + cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_sizes=(5, 3), + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0.0, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + ) + if c.discriminator_model == "parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + ) + return model From d7225eedb0d8685395db0a58b434819b731d183c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:42:36 +0200 Subject: [PATCH 202/258] Update `vocoder` datasets and `setup_dataset` --- TTS/vocoder/datasets/__init__.py | 57 ++++++++++++++++++++++++ TTS/vocoder/datasets/preprocess.py | 17 ++++++- TTS/vocoder/datasets/wavegrad_dataset.py | 2 +- TTS/vocoder/datasets/wavernn_dataset.py | 27 ++++++----- 4 files changed, 89 insertions(+), 14 deletions(-) diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index e69de29b..86b059c3 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -0,0 +1,57 @@ +from typing import List + +from coqpit import Coqpit +from torch.utils.data import Dataset + +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset + + +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: + if config.model.lower() in "gan": + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + elif config.model.lower() == "wavegrad": + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + 
hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + elif config.model.lower() == "wavernn": + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + else: + raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.") + return dataset diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index d99ee147..c4569b3d 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -3,10 +3,21 @@ import os from pathlib import Path import numpy as np +from coqpit import Coqpit from tqdm import tqdm +from TTS.utils.audio import AudioProcessor -def preprocess_wav_files(out_path, config, ap): + +def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): + """Process wav and compute mel and quantized wave signal. + It is mainly used by WaveRNN dataloader. + + Args: + out_path (str): Parent folder path to save the files. + config (Coqpit): Model config. + ap (AudioProcessor): Audio processor. + """ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) wav_files = find_wav_files(config.data_path) @@ -18,7 +29,9 @@ def preprocess_wav_files(out_path, config, ap): mel = ap.melspectrogram(y) np.save(mel_path, mel) if isinstance(config.mode, int): - quant = ap.mulaw_encode(y, qc=config.mode) if config.mulaw else ap.quantize(y, bits=config.mode) + quant = ( + ap.mulaw_encode(y, qc=config.mode) if config.model_params.mulaw else ap.quantize(y, bits=config.mode) + ) np.save(quant_path, quant) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index c0d24e84..d99fc417 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -136,4 +136,4 @@ class WaveGradDataset(Dataset): mels[idx, :, : mel.shape[1]] = mel audios[idx, : audio.shape[0]] = audio - return mels, audios + return audios, mels diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1596ea8f..d648b68c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -10,16 +10,7 @@ class WaveRNNDataset(Dataset): """ def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - mulaw, - is_training=True, - verbose=False, + self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True ): super().__init__() @@ -34,6 +25,7 @@ class WaveRNNDataset(Dataset): self.mulaw = mulaw self.is_training = is_training self.verbose = verbose + self.return_segments = return_segments assert self.seq_len % self.hop_len == 0 @@ -44,6 +36,16 @@ class WaveRNNDataset(Dataset): item = self.load_item(index) return item + def load_test_samples(self, num_samples): + samples = [] + return_segments = self.return_segments + self.return_segments = False + for idx in range(num_samples): + mel, audio, _ = self.load_item(idx) + samples.append([mel, audio]) + self.return_segments = return_segments + return samples + def load_item(self, index): """ load (audio, feat) couple if feature_path is set @@ -53,7 +55,10 @@ class WaveRNNDataset(Dataset): 
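        # With `return_segments=True` a fixed-length training segment is sampled from the clip,
        # so it must be at least `2 * seq_len + 2 * pad * hop_len` samples long; with
        # `return_segments=False` (as used by `load_test_samples`) the whole clip is returned
        # and only the padding margin is required.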
wavpath = self.item_list[index] audio = self.ap.load_wav(wavpath) - min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + if self.return_segments: + min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + else: + min_audio_len = audio.shape[0] + (2 * self.pad * self.hop_len) if audio.shape[0] < min_audio_len: print(" [!] Instance is too short! : {}".format(wavpath)) audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len]) From 45947acb603e788cc730116e0a70a9ba93221a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:44:02 +0200 Subject: [PATCH 203/258] Update `TTS.bin` scripts for the new API --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/compute_statistics.py | 2 +- TTS/bin/convert_tacotron2_torch_to_tf.py | 6 +++--- TTS/bin/distribute.py | 21 +++++---------------- TTS/bin/extract_tts_spectrograms.py | 17 ++++------------- 5 files changed, 14 insertions(+), 34 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index eb708040..35721f59 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -75,7 +75,7 @@ Example run: # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker - model = setup_model(num_chars, num_speakers=0, c=C) + model = setup_model(C) model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda) model.eval() diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 25e3fce5..6179dafc 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -77,7 +77,7 @@ def main(): print(f" > Avg mel spec mean: {mel_mean.mean()}") print(f" > Avg mel spec scale: {mel_scale.mean()}") print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f" > Avg linear spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling CONFIG.audio.stats_path = output_file_path diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index 119529ae..a6fb5d9b 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -31,18 +31,18 @@ c = load_config(config_path) num_speakers = 0 # init torch model -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c) +model = setup_model(c) checkpoint = torch.load(args.torch_model_path, map_location=torch.device("cpu")) state_dict = checkpoint["model"] model.load_state_dict(state_dict) # init tf model +num_chars = len(phonemes) if c.use_phonemes else len(symbols) model_tf = Tacotron2( num_chars=num_chars, num_speakers=num_speakers, r=model.decoder.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 20d4bb20..873ddb1f 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -1,36 +1,24 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -import argparse import os import pathlib import subprocess -import sys import time import torch +from TTS.trainer import TrainingArgs + def main(): """ Call train.py as a new process and pass command arguments """ - parser = argparse.ArgumentParser() + parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target 
training script to distibute.") - parser.add_argument( - "--continue_path", - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in sys.argv - ) args, unargs = parser.parse_known_args() + breakpoint() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") @@ -51,6 +39,7 @@ def main(): my_env = os.environ.copy() my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) + # prevent stdout for processes with rank != 0 stdout = None if i == 0 else open(os.devnull, "w") p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 975f29d9..11cdfe31 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -14,7 +14,6 @@ from TTS.tts.datasets import load_meta_data from TTS.tts.datasets.TTSDataset import TTSDataset from TTS.tts.models import setup_model from TTS.tts.utils.speakers import get_speaker_manager -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -40,9 +39,7 @@ def setup_loader(ap, r, verbose=False): use_noise_augment=False, verbose=verbose, speaker_id_mapping=speaker_manager.speaker_ids, - d_vector_mapping=speaker_manager.d_vectors - if c.use_speaker_embedding and c.use_external_speaker_embedding_file - else None, + d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -224,16 +221,10 @@ def extract_spectrograms( def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data, symbols, phonemes, model_characters, speaker_manager + global meta_data, speaker_manager # Audio processor ap = AudioProcessor(**c.audio) - if "characters" in c.keys() and c["characters"]: - symbols, phonemes = make_symbols(**c.characters) - - # set model characters - model_characters = phonemes if c.use_phonemes else symbols - num_chars = len(model_characters) # load data instances meta_data_train, meta_data_eval = load_meta_data(c.datasets) @@ -245,7 +236,7 @@ def main(args): # pylint: disable=redefined-outer-name speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim) + model = setup_model(c) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") @@ -283,5 +274,5 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) - c.audio["do_trim_silence"] = False # IMPORTANT!!!!!!!!!!!!!!! 
disable to align mel + c.audio.trim_silence = False main(args) From 106b63d8a99361056114981719dbd6b42f1d1f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:45:52 +0200 Subject: [PATCH 204/258] Update `vocoder` utils --- TTS/vocoder/layers/losses.py | 25 ++++- TTS/vocoder/utils/generic_utils.py | 166 +---------------------------- 2 files changed, 25 insertions(+), 166 deletions(-) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 18076d85..9acdeea1 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,8 +1,12 @@ +from typing import Dict, Union + import librosa import torch from torch import nn from torch.nn import functional as F +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss + class TorchSTFT(nn.Module): # pylint: disable=abstract-method """TODO: Merge this with audio.py""" @@ -374,7 +378,7 @@ class GeneratorLoss(nn.Module): feat_match_loss = self.feat_match_loss(feats_fake, feats_real) return_dict["G_feat_match_loss"] = feat_match_loss adv_loss = adv_loss + self.feat_match_loss_weight * feat_match_loss - return_dict["G_loss"] = gen_loss + adv_loss + return_dict["loss"] = gen_loss + adv_loss return_dict["G_gen_loss"] = gen_loss return_dict["G_adv_loss"] = adv_loss return return_dict @@ -419,5 +423,22 @@ class DiscriminatorLoss(nn.Module): return_dict["D_hinge_gan_fake_loss"] = hinge_D_fake_loss loss += hinge_D_loss - return_dict["D_loss"] = loss + return_dict["loss"] = loss return return_dict + + +class WaveRNNLoss(nn.Module): + def __init__(self, wave_rnn_mode: Union[str, int]): + super().__init__() + if wave_rnn_mode == "mold": + self.loss_func = discretized_mix_logistic_loss + elif wave_rnn_mode == "gauss": + self.loss_func = gaussian_loss + elif isinstance(wave_rnn_mode, int): + self.loss_func = torch.nn.CrossEntropyLoss() + else: + raise ValueError(" [!] Unknown mode for Wavernn.") + + def forward(self, y_hat, y) -> Dict: + loss = self.loss_func(y_hat, y) + return {"loss": loss} diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index cb45feb0..eeabbea5 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,6 +1,3 @@ -import importlib -import re - import numpy as np import torch from matplotlib import pyplot as plt @@ -29,7 +26,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat, y, ap, global_step, name_prefix): +def plot_results(y_hat, y, ap, name_prefix): """Plot vocoder model results""" # select an instance from batch @@ -47,7 +44,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): plt.title("groundtruth speech") plt.subplot(2, 1, 2) plt.plot(y_hat) - plt.title(f"generated speech @ {global_step} steps") + plt.title("generated speech") plt.tight_layout() plt.close() @@ -58,162 +55,3 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): name_prefix + "speech_comparison": fig_wave, } return figures - - -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) - # this is to preserve the WaveRNN class name (instead of Wavernn) - if c.generator_model.lower() == "wavernn": - MyModel = getattr(MyModel, "WaveRNN") - else: - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model.lower() in "wavernn": - model = MyModel( - rnn_dims=c.wavernn_model_params["rnn_dims"], - fc_dims=c.wavernn_model_params["fc_dims"], - mode=c.mode, - mulaw=c.mulaw, - pad=c.padding, - use_aux_net=c.wavernn_model_params["use_aux_net"], - use_upsample_net=c.wavernn_model_params["use_upsample_net"], - upsample_factors=c.wavernn_model_params["upsample_factors"], - feat_dims=c.audio["num_mels"], - compute_dims=c.wavernn_model_params["compute_dims"], - res_out_dims=c.wavernn_model_params["res_out_dims"], - num_res_blocks=c.wavernn_model_params["num_res_blocks"], - hop_length=c.audio["hop_length"], - sample_rate=c.audio["sample_rate"], - ) - elif c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) - elif c.generator_model.lower() in "melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model in "melgan_fb_generator": - raise ValueError("melgan_fb_generator is now fullband_melgan_generator") - elif c.generator_model.lower() in "multiband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "fullband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "parallel_wavegan_generator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=c.audio["num_mels"], - dropout=0.0, - bias=True, - use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) - elif c.generator_model.lower() in "wavegrad": - model = MyModel( - in_channels=c["audio"]["num_mels"], - out_channels=1, - use_weight_norm=c["model_params"]["use_weight_norm"], - x_conv_channels=c["model_params"]["x_conv_channels"], - y_conv_channels=c["model_params"]["y_conv_channels"], - dblock_out_channels=c["model_params"]["dblock_out_channels"], - ublock_out_channels=c["model_params"]["ublock_out_channels"], - upsample_factors=c["model_params"]["upsample_factors"], - upsample_dilations=c["model_params"]["upsample_dilations"], - ) - else: - raise NotImplementedError(f"Model {c.generator_model} not implemented!") - return model - - -def setup_discriminator(c): - print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: - MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") - else: - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "hifigan_discriminator": - model = MyModel() - if c.discriminator_model in "random_window_discriminator": - model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], - cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], - cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - ) - if c.discriminator_model == "parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, - ) - return model - - -# def check_config(c): -# c = None -# pass From e53616078af4074b67f6f2d8e5182d43d3679541 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:26:11 +0200 Subject: [PATCH 205/258] Fixup `utils` for the trainer --- TTS/utils/generic_utils.py | 14 ++++++-------- TTS/utils/logging/tensorboard_logger.py | 2 ++ TTS/utils/manage.py | 2 +- TTS/utils/radam.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 67cd0bf5..e7c57529 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -16,9 +16,10 @@ import torch def to_cuda(x: torch.Tensor) -> torch.Tensor: if x is None: return None - x = x.contiguous() - if torch.cuda.is_available(): - x = x.cuda(non_blocking=True) + if torch.is_tensor(x): + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) return x @@ -57,13 +58,10 @@ def get_commit_hash(): return commit -def create_experiment_folder(root_path, model_name, debug): +def create_experiment_folder(root_path, model_name): """Create a folder with the current date and time""" date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - if debug: - commit_hash = "debug" - else: - commit_hash = get_commit_hash() + commit_hash = get_commit_hash() output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) os.makedirs(output_folder, exist_ok=True) print(" > Experiment folder: {}".format(output_folder)) diff --git a/TTS/utils/logging/tensorboard_logger.py b/TTS/utils/logging/tensorboard_logger.py index 657deb5b..3d7ea1e6 100644 --- a/TTS/utils/logging/tensorboard_logger.py +++ b/TTS/utils/logging/tensorboard_logger.py @@ -34,6 +34,8 @@ class 
TensorboardLogger(object): def dict_to_tb_audios(self, scope_name, audios, step, sample_rate): for key, value in audios.items(): + if value.dtype == "float16": + value = value.astype("float32") try: self.writer.add_audio("{}/{}".format(scope_name, key), value, step, sample_rate=sample_rate) except RuntimeError: diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index f5165079..93497517 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -137,7 +137,7 @@ class ModelManager(object): # set scale stats path in config.json config_path = output_config_path config = load_config(config_path) - config.external_speaker_embedding_file = output_speakers_path + config.d_vector_file = output_speakers_path config.save_json(config_path) return output_model_path, output_config_path, model_item diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index b6c86fed..73426e64 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -1,4 +1,4 @@ -# from https://github.com/LiyuanLucasLiu/RAdam +# modified from https://github.com/LiyuanLucasLiu/RAdam import math From f23b228e24146aca92d8ed9c669203f93f99ba4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:26:41 +0200 Subject: [PATCH 206/258] Update `speaker_manager` --- TTS/tts/utils/speakers.py | 167 ++++++++++++++++++++------------------ 1 file changed, 88 insertions(+), 79 deletions(-) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 01e26c6b..5caa2fee 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -5,91 +5,13 @@ from typing import Any, Dict, List, Tuple, Union import numpy as np import torch +from coqpit import Coqpit from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def _set_file_path(path): - """Find the speakers.json under the given path or the above it. - Intended to band aid the different paths returned in restored and continued training.""" - path_restore = os.path.join(os.path.dirname(path), "speakers.json") - path_continue = os.path.join(path, "speakers.json") - if os.path.exists(path_restore): - return path_restore - if os.path.exists(path_continue): - return path_continue - raise FileNotFoundError(f" [!] `speakers.json` not found in {path}") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - if os.path.splitext(out_path)[1] == ".json": - json_file = out_path - else: - json_file = _set_file_path(out_path) - with open(json_file) as f: - return json.load(f) - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - if out_path is not None: - speakers_json_path = _set_file_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def get_speaker_manager(c, restore_path, meta_data_train, out_path=None): - """Inititalize and return a `SpeakerManager` based on config values""" - speaker_manager = SpeakerManager() - if c.use_speaker_embedding: - speaker_manager.set_speaker_ids_from_data(meta_data_train) - if restore_path: - speakers_file = _set_file_path(restore_path) - # restoring speaker manager from a previous run. 
- if c.use_external_speaker_embedding_file: - # restore speaker manager with the embedding file - if not os.path.exists(speakers_file): - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - if not os.path.exists(c.external_speaker_embedding_file): - raise RuntimeError( - "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" - ) - speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) - speaker_manager.set_d_vectors_from_file(speakers_file) - elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. - speaker_ids_from_data = speaker_manager.speaker_ids - speaker_manager.set_speaker_ids_from_file(speakers_file) - assert all( - speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data - ), " [!] You cannot introduce new speakers to a pre-trained model." - elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: - # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file - ): # new speaker manager with speaker IDs file. - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - print( - " > Training with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) - ) - ) - # save file if path is defined - if out_path: - out_file_path = os.path.join(out_path, "speakers.json") - print(f" > Saving `speakers.json` to {out_file_path}.") - if c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: - speaker_manager.save_d_vectors_to_file(out_file_path) - else: - speaker_manager.save_speaker_ids_to_file(out_file_path) - return speaker_manager - - class SpeakerManager: """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information in a way that you can query. There are 3 different scenarios considered. @@ -356,3 +278,90 @@ class SpeakerManager: def plot_embeddings(self): # TODO: implement speaker encoder raise NotImplementedError + + +def _set_file_path(path): + """Find the speakers.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "speakers.json") + path_continue = os.path.join(path, "speakers.json") + if os.path.exists(path_restore): + return path_restore + if os.path.exists(path_continue): + return path_continue + raise FileNotFoundError(f" [!] 
`speakers.json` not found in {path}") + + +def load_speaker_mapping(out_path): + """Loads speaker mapping if already present.""" + if os.path.splitext(out_path)[1] == ".json": + json_file = out_path + else: + json_file = _set_file_path(out_path) + with open(json_file) as f: + return json.load(f) + + +def save_speaker_mapping(out_path, speaker_mapping): + """Saves speaker mapping if not yet present.""" + if out_path is not None: + speakers_json_path = _set_file_path(out_path) + with open(speakers_json_path, "w") as f: + json.dump(speaker_mapping, f, indent=4) + + +def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: + """Create a SpeakerManager instance based on provided configuration. + + Args: + c (Coqpit): Model configuration. + restore_path (str): Path to a previous training folder. + data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding + layers is used. Defaults to None. + out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None. + + Returns: + SpeakerManager: + """ + speaker_manager = SpeakerManager() + if c.use_speaker_embedding: + if data is not None: + speaker_manager.set_speaker_ids_from_data(data) + if restore_path: + speakers_file = _set_file_path(restore_path) + # restoring speaker manager from a previous run. + if c.use_d_vector_file: + # restore speaker manager with the embedding file + if not os.path.exists(speakers_file): + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file") + if not os.path.exists(c.d_vector_file): + raise RuntimeError( + "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" + ) + speaker_manager.load_d_vectors_file(c.d_vector_file) + speaker_manager.set_d_vectors_from_file(speakers_file) + elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. + speaker_ids_from_data = speaker_manager.speaker_ids + speaker_manager.set_speaker_ids_from_file(speakers_file) + assert all( + speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + ), " [!] You cannot introduce new speakers to a pre-trained model." + elif c.use_d_vector_file and c.d_vector_file: + # new speaker manager with external speaker embeddings. + speaker_manager.set_d_vectors_from_file(c.d_vector_file) + elif c.use_d_vector_file and not c.d_vector_file: # new speaker manager with speaker IDs file. 
+ raise "use_d_vector_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + print( + " > Training with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) + ) + # save file if path is defined + if out_path: + out_file_path = os.path.join(out_path, "speakers.json") + print(f" > Saving `speakers.json` to {out_file_path}.") + if c.use_d_vector_file and c.d_vector_file: + speaker_manager.save_d_vectors_to_file(out_file_path) + else: + speaker_manager.save_speaker_ids_to_file(out_file_path) + return speaker_manager From 15fa31b595d58f20cf388f047701172c449631e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:27:28 +0200 Subject: [PATCH 207/258] fixup configs --- TTS/config/shared_configs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index a7976db7..801855c1 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument @dataclass @@ -214,7 +214,7 @@ class BaseTrainingConfig(Coqpit): to 10000. num_loader_workers (int): Number of workers for training time dataloader. - num_val_loader_workers (int): + num_eval_loader_workers (int): Number of workers for evaluation time dataloader. output_path (str): Path for training output folder. The nonexist part of the given path is created automatically. @@ -243,8 +243,8 @@ class BaseTrainingConfig(Coqpit): keep_all_best: bool = False keep_after: int = 10000 # dataloading - num_loader_workers: int = MISSING - num_val_loader_workers: int = 0 + num_loader_workers: int = None + num_eval_loader_workers: int = 0 use_noise_augment: bool = False # paths output_path: str = None From e30f245e061256764ab6c580286c1a6c8841826c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:49:06 +0200 Subject: [PATCH 208/258] Update `synthesizer` for speaker and model init --- TTS/utils/synthesizer.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 8f510f20..365ab8bd 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -6,7 +6,7 @@ import pysbd import torch from TTS.config import load_config -from TTS.tts.models import setup_model +from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import @@ -14,7 +14,8 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis, trim_silence from TTS.tts.utils.text import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor -from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input, setup_generator +from TTS.vocoder.models import setup_model as setup_vocoder_model +from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input class Synthesizer(object): @@ -98,7 +99,7 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - 
self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("d_vector_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers self.d_vector_dim = self.speaker_manager.d_vector_dim @@ -127,16 +128,11 @@ class Synthesizer(object): if self.tts_config.use_speaker_embedding is True: self.tts_speakers_file = ( - self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"] + self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"] ) - self._load_speakers(self.tts_speakers_file) + self.tts_config["d_vector_file"] = self.tts_speakers_file - self.tts_model = setup_model( - self.input_size, - num_speakers=self.num_speakers, - c=self.tts_config, - d_vector_dim=self.d_vector_dim, - ) + self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -151,7 +147,7 @@ class Synthesizer(object): """ self.vocoder_config = load_config(model_config) self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) - self.vocoder_model = setup_generator(self.vocoder_config) + self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() From a5d5bc90631a344d2ce4f1d1cde70c637d9aac3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:52:27 +0200 Subject: [PATCH 209/258] Print `max_decoder_steps` when model reaches the limit --- TTS/tts/layers/tacotron/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index 61fe9f4b..9c33623e 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -357,7 +357,7 @@ class Decoder(nn.Module): if stop_token > self.stop_threshold and t > inputs.shape[0] // 2: break if len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") + print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}") break memory = self._update_memory(decoder_output) From 9455a2b01e81ed6d61c72a0bac903c41c1f34dfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:54:04 +0200 Subject: [PATCH 210/258] Apply small fixes for API compatibility --- TTS/tts/tf/utils/generic_utils.py | 2 +- notebooks/PlotUmapLibriTTS.ipynb | 4 +++- notebooks/TestAttention.ipynb | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py index e76893c2..91434a38 100644 --- a/TTS/tts/tf/utils/generic_utils.py +++ b/TTS/tts/tf/utils/generic_utils.py @@ -83,7 +83,7 @@ def setup_model(num_chars, num_speakers, c, enable_tflite=False): num_chars=num_chars, num_speakers=num_speakers, r=c.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index 97f9800d..0448f3df 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -33,7 +33,9 @@ "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", ======= - "from 
TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import load_config\n", >>>>>>> dev "\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index ed1c245b..5d8eed85 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -37,7 +37,9 @@ "import librosa.display\n", "\n", "from TTS.tts.layers import *\n", - "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.text import text_to_sequence\n", From 8c74f054f0e2b1558f55f4d5b829e311b7a7fe0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 14:57:05 +0200 Subject: [PATCH 211/258] =?UTF-8?q?Enable=20support=20for=20=F0=9F=90=8D?= =?UTF-8?q?=20python=203.10?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump up versions numpy 1.19.5 and TF 2.5.0 --- TTS/tts/datasets/__init__.py | 4 ---- pyproject.toml | 2 +- requirements.tf.txt | 2 +- requirements.txt | 2 +- setup.py | 5 ++--- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index bcdbf6a6..cbae78a7 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -7,10 +7,6 @@ import numpy as np from TTS.tts.datasets.formatters import * from TTS.tts.datasets.TTSDataset import TTSDataset -#################### -# UTILITIES -#################### - def split_dataset(items): speakers = [item[-1] for item in items] diff --git a/pyproject.toml b/pyproject.toml index feaf5fd4..0941a906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy==1.18.5"] +requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"] [flake8] max-line-length=120 diff --git a/requirements.tf.txt b/requirements.tf.txt index 60f6e6c9..8e256a90 100644 --- a/requirements.tf.txt +++ b/requirements.tf.txt @@ -1 +1 @@ -tensorflow==2.3.1 +tensorflow==2.5.0 diff --git a/requirements.txt b/requirements.txt index 046139d0..7437b78a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ inflect jieba librosa==0.8.0 matplotlib -numpy==1.18.5 +numpy==1.19.5 pandas pypinyin pysbd diff --git a/setup.py b/setup.py index 7cfb6519..b4015455 100644 --- a/setup.py +++ b/setup.py @@ -11,9 +11,8 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup - if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) + raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) cwd = os.path.dirname(os.path.abspath(__file__)) @@ -99,7 +98,7 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires=">=3.6.0, <3.9", + python_requires=">=3.6.0, <3.10", entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: Python", From a196007062f49c40ed728c40e0bf774dc6eb5438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:01:39 +0200 Subject: [PATCH 212/258] Update Pylint 
configuration --- .pylintrc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index 34c121eb..7293f5ad 100644 --- a/.pylintrc +++ b/.pylintrc @@ -61,6 +61,9 @@ confidence= # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable=missing-docstring, + too-many-public-methods, + too-many-lines, + bare-except, line-too-long, fixme, wrong-import-order, From 0636c91919612cfd5b757d3691e49f684e10a342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:02:02 +0200 Subject: [PATCH 213/258] Update gitignore --- .gitignore | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7ca905ff..c4647723 100644 --- a/.gitignore +++ b/.gitignore @@ -124,7 +124,9 @@ version.py # jupyter dummy files core +# files used internally fro dev, test etc. tests/outputs/* +tests/train_outputs/* TODO.txt .vscode/* data/* @@ -132,7 +134,21 @@ notebooks/data/* TTS/tts/layers/glow_tts/monotonic_align/core.c .vscode-upload.json temp_build/* -recipes/* - -# nohup logs +recipes/WIP/* +recipes/ljspeech/LJSpeech-1.1/* +events.out* +old_configs/* +model_importers/* +model_profiling/* +docs/* +.noseids +.dccache +log.txt +umap.png *.out +SocialMedia.txt +output.wav +tts_output.wav +deps.json +speakers.json +internal/* \ No newline at end of file From 418c7d98d526934fd73505472774a70e98bd2cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 15:09:40 +0200 Subject: [PATCH 214/258] Create LJSpeech recipes for all the models --- recipes/ljspeech/README.md | 19 ++++ recipes/ljspeech/download_ljspeech.sh | 14 +++ recipes/ljspeech/glow_tts/train_glowtts.py | 30 +++++++ recipes/ljspeech/hifigan/train_hifigan.py | 30 +++++++ .../train_multiband_melgan.py | 30 +++++++ recipes/ljspeech/tacotron2-DCA/run.sh | 22 +++++ .../ljspeech/tacotron2-DCA/scale_stats.npy | Bin 0 -> 10700 bytes .../ljspeech/tacotron2-DCA/tacotron2-DCA.json | 85 ++++++++++++++++++ .../ljspeech/tacotron2-DDC/scale_stats.npy | Bin 0 -> 10700 bytes recipes/ljspeech/wavegrad/train_wavegrad.py | 29 ++++++ recipes/ljspeech/wavernn/train_wavernn.py | 30 +++++++ 11 files changed, 289 insertions(+) create mode 100644 recipes/ljspeech/README.md create mode 100644 recipes/ljspeech/download_ljspeech.sh create mode 100644 recipes/ljspeech/glow_tts/train_glowtts.py create mode 100644 recipes/ljspeech/hifigan/train_hifigan.py create mode 100644 recipes/ljspeech/multiband_melgan/train_multiband_melgan.py create mode 100644 recipes/ljspeech/tacotron2-DCA/run.sh create mode 100644 recipes/ljspeech/tacotron2-DCA/scale_stats.npy create mode 100644 recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json create mode 100644 recipes/ljspeech/tacotron2-DDC/scale_stats.npy create mode 100644 recipes/ljspeech/wavegrad/train_wavegrad.py create mode 100644 recipes/ljspeech/wavernn/train_wavernn.py diff --git a/recipes/ljspeech/README.md b/recipes/ljspeech/README.md new file mode 100644 index 00000000..94508a7f --- /dev/null +++ b/recipes/ljspeech/README.md @@ -0,0 +1,19 @@ +# 🐸💬 TTS LJspeech Recipes + +For running the recipes + +1. Download the LJSpeech dataset here either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. +2. Go to your desired model folder and run the training. + + Running Python files. 
(Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) + ```terminal + CUDA_VISIBLE_DEVICES="0" python train_modelX.py + ``` + + Running bash scripts. + ```terminal + bash run.sh + ``` + +💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best +result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh new file mode 100644 index 00000000..14ef058d --- /dev/null +++ b/recipes/ljspeech/download_ljspeech.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 \ No newline at end of file diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py new file mode 100644 index 00000000..0a3c3838 --- /dev/null +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import GlowTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py new file mode 100644 index 00000000..99b39e99 --- /dev/null +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -0,0 +1,30 @@ +import os + +from TTS.vocoder.configs import HifiganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = HifiganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git 
a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py new file mode 100644 index 00000000..6b766ab7 --- /dev/null +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -0,0 +1,30 @@ +import os + +from TTS.vocoder.configs import MultibandMelganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = MultibandMelganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DCA/run.sh b/recipes/ljspeech/tacotron2-DCA/run.sh new file mode 100644 index 00000000..8bcd9e3d --- /dev/null +++ b/recipes/ljspeech/tacotron2-DCA/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# # download LJSpeech dataset +# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# # extract +# tar -xjf LJSpeech-1.1.tar.bz2 +# # create train-val splits +# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +# mv LJSpeech-1.1 $RUN_DIR/ +# rm LJSpeech-1.1.tar.bz2 +# # compute dataset mean and variance for normalization +# python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DCA.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path /media/erogol/nvme_linux/gdrive/Projects/TTS/recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DCA/scale_stats.npy b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy new file mode 100644 index 0000000000000000000000000000000000000000..1dc577a68253f87470e66444db4f19e583dc7adb GIT binary patch literal 10700 zcmbulc|4Te8$Ukwy^TUDyHb`UmB^)(lr<@9$ugF*jV+2oN<>MEM?yqNR7y&vbEl1z zrAUQrm6>5?gu!5DzUQgW^ZmYlfB)u>xnAcw*Ltq&oO|Ydp9^+-Y#m$$NYSLD%YDfa zKHC6ik0*XC;NnkL_~&1`9w-mNPndyLwx_vQUt6?Cnq@sbvmd9>Q8;?RxC&8%cWqNT%$moO|y z7=)6Bml+8Wi2*-eH69frh6P!@4pbpRs(VO9Kwp@6w*010vyU*5K5m@++(DSQy!hFz zI6q;+ShGd5?UNAECz&=&6bKRF??3EzJS;?b`NYK2b_fwNif*hY>x76WpG6{=QbNQL z+m?e9N5}G%zoDf=c0$K4HRgxm59+Zbeazg=O(KMl!npFX zaS_7XX}`v(uPAX}>*bnNYs84FEuH^Rb;JoN>6*A3qvC{!n&`3kcnPBOX>0G7YZ8Qq z$L3+7L`?=_XQqYSKw{EYdIY(xi*Eb@6h0hn?{a& zG8G19+Akicr9#n_M*da?70$HlHeczX!sV{)l%hx~oJxK_#!sO_sfv-#<_;=M-50)A z_<;(P!jmO&m@hkT-nn86Do9tnezPNl3O(o7Ui25F!FE|`%f!1>5Lofu)BZgb6wmc+ z{V+>~eM;5aUeKvv*#1VK=`ML8tD4L3lhp zFRM+S3Mr@EyjjLnC|OhOdDRK~?<ntd*=vI^ zxu=y>RXqrgB2rej@dhEK$a%HFVk#t0TPd&JM}@mCm&7DLQ^DY$qWc@HXmE*oj%6H5 zgW$ODvGjv9*gMmDSUHadQrB(Ex60AsmA_()_f2QuD-Zf#(|_P$%Ai60o?J(GH49h(8;OIqHo z6lVfOIg+B{?IMMnHYhNJ>=pj^cvk(dan=59Ty;l^25wwi0l||L&3`*rd>)B(cs4+7 zGE9Il<7(+%mnRXncPyn>Zl6KvJ5usSou^QSP{h=!eQabE_2Y^wX#}O28mPD#(9nV8 zfRFaN1E_xgdew_YUFew(cSY;v&uHasxBl1jKcnw`+@a@J+L3_Is_!I+&nV{INMlWM z2O5-=^>ppsTT#&puHoD(pAhm&7jW9rj@-YT`5YY9fwRoFj1qUDDS)#3oVro%l{b7LU9Mg z&V{tH5V`Wn8xuYYT@ZYHW!J$G)NowEoZyTg`={HARvr9}-p$c5Eq(MGUGa%}u~~i; z?btLLlRIw|l?TS|drKKbE-kLBzsZgv1;ZOA_npR2nTon{RoNK2{>w|Wk~@aNV#<3= zgvU{E_E_2+y>Vok=seeP-#8L|x!=+wbsXuwqg0v~kE1(Q6V3}V#?f_C*%Zg9an!RQ zfZt^>j3@wZsLpM!VtJ>O(q0QJv0!lz^*0*ak)CPH_>EK@xLP8Ozfqi+zuR2aZxqw`OGS4Mwl`sg z?yMa}y@q--Az`D)wlJO1*))pSa#z<_tr|m3nR!#sc8sBk*_xQ_%rPVmucZwK$B^yC z=ly4O#!Jjy6C4pX`lK7$+Goy2p}lXUbC0r7 zoWiFlp)@=eJ~Y>%ij69IZgk|dvC#o<(Ea|4jiR0g`LF)QMh`^v6FR%th;-RIVeLmY zI#s89prDS8((Qe>_r7AI{J!5O6)M>1xuxLwm5EY2Ao%WTyUo?2g?y5-D4k zK506RHkRB|wVWD5zcWZ$s$FA9-R^K^Z08ti@ywaxw2vX~s}{au(->0ij$5{>e+=md zpq^tLW2h)=NY(et7%J2znf>}bhIBo7Pn2-q)wdj^md+nX(9)~of!AB>m4|(-+c+|) z3f6u{9!I%m*|#@b7)NWHQ$*Dtk0VVJi$YeNT%a@s(^eSM_7_PCUNHnDLS~U?YL~=c_V| z*obm2y7$9cHfsNV`Q_LeHrlCNGmmP*M%QG6=WoUSUx{tFz0eBxpXNoHod+8^<>)V& zCgVPeJJkOG#~c0A%_kXCdH+tGO~p9A9Rd|5_t~gp_ZwzNJsWL)S|2CZ&qe{s-O^Pf zY_vCSH1Vy>1Tyt%DA}Yqfr`q7YH1rM(4Ht?e%oeDr~US>_MAY~=Ui`u`%R#|%(3b3 z2@~k%zL1hHrzX&@BG)3@s}m@!(o!ws#sqR&DDhC_A(qp+`t#1y3H0IpOqWK(1WI~g zWu(_TfjEw<49`&~P^jJs-x%%$dTQ0VXpzJuI(PVWl+3b8v~S0zX7{C&$T?7GZn^0s zvUo0cKH6dui55;RJK!;i99v&?%X&_ty9SbDZAT|jyuE+%jkrlvT6^cc%-Knl>b^6| z@8Tp{ym8sC=8{Pidf{sDy2?qE@Xqhm?R%4`M=SAs@|#K2`K{kXfS5#s%WlSLcTA$L zT_*pOHccW%+%7qXL98bM>s&uiqOqQ(Vh2Vhk?DKI<3A=Qk!$ZzN0RImDmy;Ym#RF4 zcw@3Y{iajs?W=t4lKoR?tS!vY{lpadC2OH}vU&Kvr~aQJPl8wc6;@AW)@|F#rf3TPau;UH&8In{_C9Q0C6WJ%VlX=J{=@t}m$G&0pQ zx=y|}jndaz27GIoMs|l??zhO#pscUQBF8OfP|Jps0!z|nP{#ad_fld8&HD8hT%Dak zgluQGkl8F+n)Hql6F7?^585ms-JV6~Jd10q8)uP{wUL0;94>NP=N44Env1rj-Ex`@ z;387U#i^bnT$Fl@?`l!NMHV}go0e2@kx1-WL(vW{dP2`=Upm4?1(VFTb@O;gcV(@# zkTwr(@Ki;k+j;1~JeyqZVIJzpI)6I1kcW1--|FAf#zRk|NJ-N(JXAkD?D$ogkD|Js zcRVrUqX)~T_9b}n(XF~2fiDm7k!!Av&9_)S%9%-`GfMpk9HvYE9_Ai9mp0eR1 zWtCyew_1FZNz!l2GROSFh1OQae8ju`Ooy|Ij|^{`JzIrowj${teMvs5&ddB%F2qN3 zd@eTNfSVA-%6L9a2_2 
zCj1`wAj;dt1o2yH4l&kD=<u$44GPRi?UGu^qL755P%C6^s#jhmI=gVbQ z$S`4FeZYYaGYqh2JpHzW!vN`?rf*tw28gFP7IpP8ARv8^wWX5*4r9eN(H|MGB6t4R z?f4wwzOt`aS;v3{&-4dwzhS`Tm~Xq?9^vz7xrSxUV+IhaDs&lw0iRM!4~{;^dTyJd zufJo!)BKcZCLSMhEh|dIb{=PKE|{!kz_xor+cskU;=(ks&-WNmQGcb!@Gb)=C%1pR zTg-so?Bf-=*$l8?i(To=#OK&X*|p&@3~-;W)#YH?lDGMHwQ^T-E?j2A1$1w1o7g`xN55sZmXH4CK^7lz~_C%swIm|74R!& znS#Au^BAyeefwp5EFb49z+8asOr*x$m^YsRWr9InF0u@GD>y1zkNwViAiu=_)K=yex zjzgg`SEvd1+wZlaOPh)q(Auy~C+jkPe%d_E&A@r6ZPhJ_KEr^8T$x9eNesBd6f5Cj z`bvqemy7cj7OXU}4Z!iZuYlFSRL*g z@h{cN_;V@EriVoCwc{BK=U}GiP7) zKfQ3o%JHdV@OMykT6i^1r3?SR;LfqI>;sB!Bt_5LMH&MLUQxb*p`Q4PnqQzlML(Xh z`tPkVKfg%Nh`=Lcih(Uj_z%)>I55OBh#cY{89-U{A1@#@>_48NElI#-4mRi)?iEb- zj0gzC7J?%vYwcX-{$K82{eRg;b}llQ9TAD8!n}MvgF}7Ely!e@@`RK9g2+CRl=XkE zNJMxAhXs*6!@VNOlnu6Od;i>N3n%|&Q;bvp2!(}{$-!X(UJ-#2l#Sc5-9Mi{`C)4~ z#Atj8&exk_VlDT_t#7DjL}*lquV+YTcre8@^-qYH85tfJjMK(p`j9D`PX7t#4-e~_ zS)00uV7pjy@X!y_o>Sp47XclcHyzTXiPLD_8W z@PF^)`2`1tP_}G0-|;^a|J~E`3-*ekSZp`1z4vFrMFNwEFtU%QzZX{6YAx`0Ndo;t zyn_D5zU}m%EU~@+m!;*-tN1?PpJGJB1_zUI20nkKx1atSf>+GnvDJ2Se68f)&mUJl zL4jd^Cw83vQ}c*OugC~oA6)aD7`P*m^n=O1fnNIL@KFEIApMYNTxooVB`QYW-#0uk znyl{-9(oYhE<)eQX}3PU`4|{Rj?l;dal*)Cp8)+xFQ3rJ@X(Mox?8te=tuZ?;lloL Tp%)SsOR Date: Mon, 21 Jun 2021 16:49:30 +0200 Subject: [PATCH 215/258] =?UTF-8?q?Fixup=20`trainer.py`=20=F0=9F=9B=A0?= =?UTF-8?q?=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index 8b7be3d1..ec6d4417 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -462,12 +462,12 @@ class Trainer: update_lr_scheduler = True if self.use_amp_scaler: if self.use_apex: - with amp.scale_loss(loss_dict["loss"], self.optimizer) as scaled_loss: + with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss: scaled_loss.backward() - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer), - self.config.grad_clip, - ) + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), + grad_clip, + ) else: # model optimizer step in mixed precision mode scaler.scale(loss_dict["loss"]).backward() @@ -739,6 +739,7 @@ class Trainer: self.tb_logger.tb_eval_figures(self.total_steps_done, figures) if audios is not None: self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.tb_logger.tb_eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values) def test_run(self) -> None: """Run test and log the results. Test run must be defined by the model. 
From d700845b109cc140ce4d50645b94d4eb19919a3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:50:37 +0200 Subject: [PATCH 216/258] Move `TorchSTFT` to `utils.audio` --- TTS/utils/audio.py | 77 ++++++++++++++++++++++++++++++++++ TTS/vocoder/layers/losses.py | 80 +----------------------------------- 2 files changed, 79 insertions(+), 78 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 222b4c74..e1913e98 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -3,12 +3,89 @@ import numpy as np import scipy.io.wavfile import scipy.signal import soundfile as sf +import torch +from torch import nn from TTS.tts.utils.data import StandardScaler # import pyworld as pw +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """TODO: Merge this with audio.py""" + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [B x 1 x T] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + # pylint: disable=too-many-public-methods class AudioProcessor(object): """Audio Processor for TTS used by all the data pipelines. 
diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 9acdeea1..848e292b 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,88 +1,12 @@ from typing import Dict, Union -import librosa import torch from torch import nn from torch.nn import functional as F +from TTS.utils.audio import TorchSTFT from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss - -class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """TODO: Merge this with audio.py""" - - def __init__( - self, - n_fft, - hop_length, - win_length, - pad_wav=False, - window="hann_window", - sample_rate=None, - mel_fmin=0, - mel_fmax=None, - n_mels=80, - use_mel=False, - ): - super().__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.pad_wav = pad_wav - self.sample_rate = sample_rate - self.mel_fmin = mel_fmin - self.mel_fmax = mel_fmax - self.n_mels = n_mels - self.use_mel = use_mel - self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) - self.mel_basis = None - if use_mel: - self._build_mel_basis() - - def __call__(self, x): - """Compute spectrogram frames by torch based stft. - - Args: - x (Tensor): input waveform - - Returns: - Tensor: spectrogram frames. - - Shapes: - x: [B x T] or [B x 1 x T] - """ - if x.ndim == 2: - x = x.unsqueeze(1) - if self.pad_wav: - padding = int((self.n_fft - self.hop_length) / 2) - x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") - # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True, - return_complex=False, - ) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) - if self.use_mel: - S = torch.matmul(self.mel_basis.to(x), S) - return S - - def _build_mel_basis(self): - mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax - ) - self.mel_basis = torch.from_numpy(mel_basis).float() - - ################################# # GENERATOR LOSSES ################################# @@ -275,7 +199,7 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): loss += total_loss real_loss += real_loss fake_loss += fake_loss - # normalize loss values with number of scales + # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) fake_loss /= len(scores_fake) From cfa5041db7ac8ab9da6f56f1e39ee99b12140e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:51:28 +0200 Subject: [PATCH 217/258] =?UTF-8?q?Fix=20`eval=5Flog`=20for=20`gan.py`=20?= =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 2 +- TTS/vocoder/models/gan.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index ec6d4417..f628d9a4 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -36,7 +36,7 @@ from TTS.utils.generic_utils import ( ) from TTS.utils.io import copy_model_files, save_best_model, save_checkpoint from TTS.utils.logging import ConsoleLogger, TensorboardLogger -from TTS.utils.trainer_utils import * +from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env from 
TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.models import setup_model as setup_vocoder_model diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 58d6532e..94583147 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -144,20 +144,24 @@ class GAN(BaseVocoder): return outputs, loss_dict - def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + @staticmethod + def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: y_hat = outputs[0]["model_outputs"] y = batch["waveform"] - figures = plot_results(y_hat, y, ap, "train") + figures = plot_results(y_hat, y, ap, name) sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - audios = {"train/audio": sample_voice} + audios = {f"{name}/audio": sample_voice} return figures, audios + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return self._log("train", ap, batch, outputs) + @torch.no_grad() def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: - return self.train_log(ap, batch, outputs) + return self._log("eval", ap, batch, outputs) def load_checkpoint( self, From 932ab107ae571af612fa8d7f406ae13017dd4d58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 16:53:19 +0200 Subject: [PATCH 218/258] =?UTF-8?q?Docstring=20edit=20in=20`TTSDataset.py`?= =?UTF-8?q?=20=E2=9C=8D=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/tts/datasets/TTSDataset.py | 131 +++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 49 deletions(-) diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index d0fbb553..0fc23231 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -2,6 +2,7 @@ import collections import os import random from multiprocessing import Pool +from typing import Dict, List import numpy as np import torch @@ -10,52 +11,82 @@ from torch.utils.data import Dataset from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.text import pad_with_eos_bos, phoneme_to_sequence, text_to_sequence +from TTS.utils.audio import AudioProcessor class TTSDataset(Dataset): def __init__( self, - outputs_per_step, - text_cleaner, - compute_linear_spec, - ap, - meta_data, - tp=None, - add_blank=False, - batch_group_size=0, - min_seq_len=0, - max_seq_len=float("inf"), - use_phonemes=False, - phoneme_cache_path=None, - phoneme_language="en-us", - enable_eos_bos=False, - speaker_id_mapping=None, - d_vector_mapping=None, - use_noise_augment=False, - verbose=False, + outputs_per_step: int, + text_cleaner: list, + compute_linear_spec: bool, + ap: AudioProcessor, + meta_data: List[List], + characters: Dict = None, + add_blank: bool = False, + batch_group_size: int = 0, + min_seq_len: int = 0, + max_seq_len: int = float("inf"), + use_phonemes: bool = False, + phoneme_cache_path: str = None, + phoneme_language: str = "en-us", + enable_eos_bos: bool = False, + speaker_id_mapping: Dict = None, + d_vector_mapping: Dict = None, + use_noise_augment: bool = False, + verbose: bool = False, ): - """ + """Generic 📂 data loader for `tts` models. 
It is configurable for different outputs and needs. + + If you need something different, you can either override or create a new class as the dataset is + initialized by the model. + Args: - outputs_per_step (int): number of time frames predicted per step. - text_cleaner (str): text cleaner used for the dataset. + outputs_per_step (int): Number of time frames predicted per step. + + text_cleaner (list): List of text cleaners to clean the input text before converting to sequence IDs. + compute_linear_spec (bool): compute linear spectrogram if True. - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - tp (dict): dict of custom text characters used for converting texts to sequences. - batch_group_size (int): (0) range of batch randomization after sorting - sequences by length. - min_seq_len (int): (0) minimum sequence length to be processed - by the loader. - max_seq_len (int): (float("inf")) maximum sequence length. - use_phonemes (bool): (true) if true, text converted to phonemes. - phoneme_cache_path (str): path to cache phoneme features. - phoneme_language (str): one the languages from - https://github.com/bootphon/phonemizer#languages - enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. - speaker_id_mapping (dict): list of speaker ids to map speaker names to numerical ids. - d_vector_mapping (dict): dictionary of d-vectors that maps each audio file to a pre-computed d-vector. - use_noise_augment (bool): enable adding random noise to wav for augmentation. - verbose (bool): print diagnostic information. + + ap (TTS.tts.utils.AudioProcessor): Audio processor object. + + meta_data (list): List of dataset instances. + + characters (dict): `dict` of custom text characters used for converting texts to sequences. + + add_blank (bool): Add a special `blank` character after every other character. It helps some + models achieve better results. Defaults to false. + + batch_group_size (int): Range of batch randomization after sorting + sequences by length. It shuffles each batch with bucketing to gather similar length sequences in a + batch. Set to 0 to disable. Defaults to 0. + + min_seq_len (int): Minimum input sequence length to be processed + by the loader. Filter out input sequences that are shorter than this. Some models have a + minimum input length due to their architecture. Defaults to 0. + + max_seq_len (int): Maximum input sequence length. Filter out input sequences that are longer than this. + It helps to control VRAM usage with long input sequences. Models with + RNN layers are especially sensitive to input length. Defaults to `Inf`. + + use_phonemes (bool): If true, the input text is converted to phonemes. Defaults to false. + + phoneme_cache_path (str): Path to cache phoneme features. It writes computed phonemes to files to use in + the coming iterations. Defaults to None. + + phoneme_language (str): One of the languages supported by the phonemizer interface. Defaults to `en-us`. + + enable_eos_bos (bool): Enable the `end of sentence` and `beginning of sentence` characters. Defaults + to False. + + speaker_id_mapping (dict): Mapping of speaker names to IDs used to compute embedding vectors by the + embedding layer. Defaults to None. + + d_vector_mapping (dict): Mapping of wav files to computed d-vectors. Defaults to None. + + use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False. + + verbose (bool): Print diagnostic information. Defaults to false.
""" super().__init__() self.batch_group_size = batch_group_size @@ -67,7 +98,7 @@ class TTSDataset(Dataset): self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap - self.tp = tp + self.characters = characters self.add_blank = add_blank self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path @@ -97,13 +128,13 @@ class TTSDataset(Dataset): return data @staticmethod - def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank): + def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, characters, add_blank): """generate a phoneme sequence from text. since the usage is for subsequent caching, we never add bos and eos chars here. Instead we add those dynamically later; based on the config option.""" phonemes = phoneme_to_sequence( - text, [cleaners], language=language, enable_eos_bos=False, tp=tp, add_blank=add_blank + text, [cleaners], language=language, enable_eos_bos=False, tp=characters, add_blank=add_blank ) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) @@ -111,7 +142,7 @@ class TTSDataset(Dataset): @staticmethod def _load_or_generate_phoneme_sequence( - wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank + wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, characters, add_blank ): file_name = os.path.splitext(os.path.basename(wav_file))[0] @@ -122,15 +153,15 @@ class TTSDataset(Dataset): phonemes = np.load(cache_path) except FileNotFoundError: phonemes = TTSDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + text, cache_path, cleaners, language, characters, add_blank ) except (ValueError, IOError): print(" [!] failed loading phonemes for {}. 
" "Recomputing.".format(wav_file)) phonemes = TTSDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + text, cache_path, cleaners, language, characters, add_blank ) if enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes, tp=tp) + phonemes = pad_with_eos_bos(phonemes, tp=characters) phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes @@ -158,13 +189,14 @@ class TTSDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ) else: text = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) assert text.size > 0, self.items[idx][1] @@ -206,7 +238,8 @@ class TTSDataset(Dataset): for idx, item in enumerate(tqdm.tqdm(self.items)): text, *_ = item sequence = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) self.items[idx][0] = sequence @@ -216,7 +249,7 @@ class TTSDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ] if self.verbose: From 7c614525d90e024869231e933afa64299234f63d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 21 Jun 2021 17:23:24 +0200 Subject: [PATCH 219/258] =?UTF-8?q?Add=20AlignTTS=20recipe=20=F0=9F=91=A9?= =?UTF-8?q?=E2=80=8D=F0=9F=8D=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- recipes/ljspeech/align_tts/train_aligntts.py | 30 ++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 recipes/ljspeech/align_tts/train_aligntts.py diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py new file mode 100644 index 00000000..4a4f86c4 --- /dev/null +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import AlignTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = AlignTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() From 9790eddada5778556e0090ee5c0b657c187e1e8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Jun 2021 03:03:30 +0200 Subject: [PATCH 220/258] =?UTF-8?q?Fix=20wrong=20argument=20name=20?= =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/extract_tts_spectrograms.py | 2 +- TTS/tts/models/base_tts.py 
| 2 +- notebooks/ExtractTTSpectrogram.ipynb | 2 +- tests/data_tests/test_loader.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 35721f59..cbb441fe 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -89,7 +89,7 @@ Example run: compute_linear_spec=False, ap=ap, meta_data=meta_data, - tp=C.characters if "characters" in C.keys() else None, + characters=c.characters if "characters" in C.keys() else None, add_blank=C["add_blank"] if "add_blank" in C.keys() else False, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 11cdfe31..b0159b86 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -27,7 +27,7 @@ def setup_loader(ap, r, verbose=False): compute_linear_spec=False, meta_data=meta_data, ap=ap, - tp=c.characters if "characters" in c.keys() else None, + characters=c.characters if "characters" in c.keys() else None, add_blank=c["add_blank"] if "add_blank" in c.keys() else False, batch_group_size=0, min_seq_len=c.min_seq_len, diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 1de7ba92..015d0200 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -164,7 +164,7 @@ class BaseTTS(BaseModel): compute_linear_spec=config.model.lower() == "tacotron", meta_data=data_items, ap=ap, - tp=config.characters, + characters=config.characters, add_blank=config["add_blank"], batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, min_seq_len=config.min_seq_len, diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index bdc7c955..4e42a3bb 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -112,7 +112,7 @@ "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,characters=c.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index cad89d09..9bc70ddd 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -44,7 +44,7 @@ class TestTTSDataset(unittest.TestCase): compute_linear_spec=True, ap=self.ap, meta_data=items, - tp=c.characters, + characters=c.characters, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), From a7617d8ab6ce1f39fe637fd1085d5362019f218a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Jun 2021 21:39:17 +0200 Subject: [PATCH 221/258] =?UTF-8?q?Add=20=F0=9F=90=8D=20python=203.9=20to?= =?UTF-8?q?=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.github/workflows/main.yml | 4 ++-- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/distribute.py | 1 - setup.py | 3 ++- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 74d5e85b..68be9274 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,8 +18,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8] - + python-version: [3.6, 3.7, 3.8, 3.9] + experimental: [false] steps: - uses: actions/checkout@v2 - uses: actions/cache@v1 diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index cbb441fe..88d60d7d 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -89,7 +89,7 @@ Example run: compute_linear_spec=False, ap=ap, meta_data=meta_data, - characters=c.characters if "characters" in C.keys() else None, + characters=C.characters if "characters" in C.keys() else None, add_blank=C["add_blank"] if "add_blank" in C.keys() else False, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 873ddb1f..742c0197 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -18,7 +18,6 @@ def main(): parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target training script to distibute.") args, unargs = parser.parse_known_args() - breakpoint() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") diff --git a/setup.py b/setup.py index b4015455..bd6a6aae 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup -if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): +if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"): raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) @@ -106,6 +106,7 @@ setup( "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", From fbba37e01eb25042de78bf706ba8dea2251bd92c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 11:08:34 +0200 Subject: [PATCH 222/258] =?UTF-8?q?Fix=20loading=20the=20`amp`=20scaler=20?= =?UTF-8?q?from=20a=20checkpoint=20=F0=9F=9B=A0=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index f628d9a4..d5aec1c9 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -306,7 +306,7 @@ class Trainer: model.load_state_dict(checkpoint["model"]) print(" > Restoring Optimizer...") optimizer = _restore_list_objs(checkpoint["optimizer"], optimizer) - if "scaler" in checkpoint and self.use_amp_scaler: + if "scaler" in checkpoint and self.use_amp_scaler and checkpoint["scaler"]: print(" > Restoring AMP Scaler...") scaler = _restore_list_objs(checkpoint["scaler"], scaler) except (KeyError, RuntimeError): From 0a1962b583292cb844b5b2f899f28f8c8d9b728e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 
11:42:11 +0200 Subject: [PATCH 223/258] Update `umap` and `numba` vers. to 0.5.1 and 0.53 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7437b78a..d5624c3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,8 +15,8 @@ soundfile tensorboardX torch>=1.7 tqdm -numba==0.52 -umap-learn==0.4.6 +numba==0.53 +umap-learn==0.5.1 anyascii coqpit # japanese g2p deps From d42d1c02eaaff88834a35ee5ff4afe8302921368 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Jun 2021 13:45:59 +0200 Subject: [PATCH 224/258] Use `torch.linalg.qr` for pytorch > `v1.9.0` --- TTS/tts/layers/glow_tts/glow.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 18c491e3..7620ef88 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import torch from torch import nn from torch.nn import functional as F @@ -81,7 +83,11 @@ class InvConvNear(nn.Module): self.no_jacobian = no_jacobian self.weight_inv = None - w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + if LooseVersion(torch.__version__) < LooseVersion("1.9"): + w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + else: + w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] + if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) From b3c073c99b58c454ae32279f83b1d0a5393c89a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 24 Jun 2021 19:08:45 +0200 Subject: [PATCH 225/258] Allow runing full path scripts with `distribute.py` --- TTS/bin/distribute.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 742c0197..e05747d0 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -24,7 +24,10 @@ def main(): # set arguments for train.py folder_path = pathlib.Path(__file__).parent.absolute() - command = [os.path.join(folder_path, args.script)] + if os.path.exists(os.path.join(folder_path, args.script)): + command = [os.path.join(folder_path, args.script)] + else: + command = [args.script] command.append("--continue_path={}".format(args.continue_path)) command.append("--restore_path={}".format(args.restore_path)) command.append("--config_path={}".format(args.config_path)) From ab563ce7cda2e46d67725ad849a1e995268fd29f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 26 Jun 2021 18:33:17 +0200 Subject: [PATCH 226/258] Start training by config.json using `register_config` --- TTS/config/__init__.py | 27 ++++++++++++++++++++++++--- TTS/trainer.py | 19 +++++++++++++++---- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index b4f1cbea..ecbe1f9a 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,8 +1,10 @@ import json import os import re +from typing import Dict import yaml +from coqpit import Coqpit from TTS.config.shared_configs import * from TTS.utils.generic_utils import find_module @@ -20,7 +22,18 @@ def read_json_with_comments(json_path): return data -def _search_configs(model_name): +def register_config(model_name: str) -> Coqpit: + """Find the right config for the given model name. 
+ + Args: + model_name (str): Model name. + + Raises: + ModuleNotFoundError: No matching config for the model name. + + Returns: + Coqpit: config class. + """ config_class = None paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] for path in paths: @@ -33,7 +46,15 @@ def _search_configs(model_name): return config_class -def _process_model_name(config_dict): +def _process_model_name(config_dict: Dict) -> str: + """Format the model name as expected. It is a band-aid for the old `vocoder` model names. + + Args: + config_dict (Dict): A dictionary including the config fields. + + Returns: + str: Formatted modelname. + """ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"] model_name = model_name.replace("_generator", "").replace("_discriminator", "") return model_name @@ -69,7 +90,7 @@ def load_config(config_path: str) -> None: raise TypeError(f" [!] Unknown config file type {ext}") config_dict.update(data) model_name = _process_model_name(config_dict) - config_class = _search_configs(model_name.lower()) + config_class = register_config(model_name.lower()) config = config_class() config.from_dict(config_dict) return config diff --git a/TTS/trainer.py b/TTS/trainer.py index d5aec1c9..e3403bae 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -18,7 +18,7 @@ from torch import nn from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data import DataLoader -from TTS.config import load_config +from TTS.config import load_config, register_config from TTS.tts.datasets import load_meta_data from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.text.symbols import parse_symbols @@ -940,7 +940,10 @@ def process_args(args, config=None): c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does logging to the console. tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does - the TensorBoard loggind. + the TensorBoard logging. + + TODO: + - Interactive config definition. """ if isinstance(args, tuple): args, coqpit_overrides = args @@ -951,9 +954,17 @@ def process_args(args, config=None): args.restore_path, best_model = get_last_checkpoint(args.continue_path) if not args.best_path: args.best_path = best_model - # setup output paths and read configs - if config is None: + # init config + if config is None and args.config_path: + # init from a file config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig + + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() # override values from command-line args config.parse_known_args(coqpit_overrides, relaxed_parser=True) if config.mixed_precision: From 6b265ae8e3e06453ad526badcceb3d0ff01cd77f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 26 Jun 2021 18:34:37 +0200 Subject: [PATCH 227/258] Docstring update --- TTS/config/shared_configs.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 801855c1..669437b9 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -180,6 +180,14 @@ class BaseTrainingConfig(Coqpit): among all the models. Args: + model (str): + Name of the model that is used in the training. + run_name (str): + Name of the experiment. This prefixes the output folder name. + run_description (str): + Short description of the experiment. 
+ epochs (int): + Number training epochs. Defaults to 10000. batch_size (int): Training batch size. eval_batch_size (int): From 65958eaa41d8d072dccb931e9523fdf22ec842bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 27 Jun 2021 20:55:20 +0200 Subject: [PATCH 228/258] Add preliminary sphinx documentation --- .gitignore | 2 +- Makefile | 26 ++- README.md | 203 ++---------------- docs/Makefile | 20 ++ docs/README.md | 0 docs/requirements.txt | 5 + docs/source/_static/logo.png | Bin 0 -> 38890 bytes docs/source/audio_processor.md | 25 +++ docs/source/conf.py | 102 +++++++++ docs/source/configuration.md | 59 +++++ docs/source/contributing.md | 3 + docs/source/converting_torch_to_tf.md | 21 ++ docs/source/dataset.md | 25 +++ docs/source/faq.md | 114 ++++++++++ docs/source/formatting_your_dataset.md | 82 +++++++ docs/source/implementing_a_new_model.md | 61 ++++++ docs/source/index.md | 40 ++++ docs/source/inference.md | 103 +++++++++ docs/source/installation.md | 39 ++++ docs/source/make.bat | 35 +++ docs/source/model_api.md | 24 +++ docs/source/readthedocs.yml | 17 ++ docs/source/trainer_api.md | 17 ++ docs/source/training_a_model.md | 165 ++++++++++++++ docs/source/tts_datasets.md | 16 ++ docs/source/tutorial_for_nervous_beginners.md | 175 +++++++++++++++ docs/source/what_makes_a_good_dataset.md | 19 ++ 27 files changed, 1200 insertions(+), 198 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/README.md create mode 100644 docs/requirements.txt create mode 100644 docs/source/_static/logo.png create mode 100644 docs/source/audio_processor.md create mode 100644 docs/source/conf.py create mode 100644 docs/source/configuration.md create mode 100644 docs/source/contributing.md create mode 100644 docs/source/converting_torch_to_tf.md create mode 100644 docs/source/dataset.md create mode 100644 docs/source/faq.md create mode 100644 docs/source/formatting_your_dataset.md create mode 100644 docs/source/implementing_a_new_model.md create mode 100644 docs/source/index.md create mode 100644 docs/source/inference.md create mode 100644 docs/source/installation.md create mode 100644 docs/source/make.bat create mode 100644 docs/source/model_api.md create mode 100644 docs/source/readthedocs.yml create mode 100644 docs/source/trainer_api.md create mode 100644 docs/source/training_a_model.md create mode 100644 docs/source/tts_datasets.md create mode 100644 docs/source/tutorial_for_nervous_beginners.md create mode 100644 docs/source/what_makes_a_good_dataset.md diff --git a/.gitignore b/.gitignore index c4647723..1b174834 100644 --- a/.gitignore +++ b/.gitignore @@ -140,7 +140,7 @@ events.out* old_configs/* model_importers/* model_profiling/* -docs/* +docs/source/TODO/* .noseids .dccache log.txt diff --git a/Makefile b/Makefile index 70b7e34a..c7815f19 100644 --- a/Makefile +++ b/Makefile @@ -6,16 +6,6 @@ help: target_dirs := tests TTS notebooks -system-deps: ## install linux system deps - sudo apt-get install -y libsndfile1-dev - -dev-deps: ## install development deps - pip install -r requirements.dev.txt - pip install -r requirements.tf.txt - -deps: ## install 🐸 requirements. - pip install -r requirements.txt - test_all: ## run tests and don't stop on an error. nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id ./run_bash_tests.sh @@ -34,5 +24,21 @@ style: ## update code style. lint: ## run pylint linter. 
pylint ${target_dirs} +system-deps: ## install linux system deps + sudo apt-get install -y libsndfile1-dev + +dev-deps: ## install development deps + pip install -r requirements.dev.txt + pip install -r requirements.tf.txt + +doc-deps: ## install docs dependencies + pip install -r docs/requirements.txt + +hub-deps: ## install deps for torch hub use + pip install -r requirements.hub.txt + +deps: ## install 🐸 requirements. + pip install -r requirements.txt + install: ## install 🐸 TTS for development. pip install -e .[all] diff --git a/README.md b/README.md index 92c2ee52..842a16d0 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![CircleCI](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)]() [![License]()](https://opensource.org/licenses/MPL-2.0) +[![Docs]()](https://tts.readthedocs.io/en/latest/) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts) @@ -16,12 +17,10 @@ 📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) -👩🏽‍🍳 [TTS training recipes](https://github.com/erogol/TTS_recipes) - 📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) ## 💬 Where to ask questions -Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it. +Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. | Type | Platforms | | ------------------------------- | --------------------------------------- | @@ -40,14 +39,11 @@ Please use our dedicated channels for questions and discussion. Help is much mor ## 🔗 Links and Resources | Type | Links | | ------------------------------- | --------------------------------------- | +| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) | 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)| | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)| | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) -| 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/%F0%9F%90%B8-TTS-Notebooks,-Examples-and-Tutorials) | | 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)| -| 🖥️ **Demo Server** | [TTS/server](https://github.com/coqui-ai/TTS/tree/master/TTS/server)| -| 🤖 **Synthesize speech** | [TTS/README.md](https://github.com/coqui-ai/TTS#example-synthesizing-speech-on-terminal-using-the-released-models)| -| 🛠️ **Implementing a New Model** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/Implementing-a-New-Model-in-%F0%9F%90%B8TTS)| ## 🥇 TTS Performance

@@ -56,20 +52,19 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models ## Features -- High performance Deep Learning models for Text2Speech tasks. +- High-performance Deep Learning models for Text2Speech tasks. - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech). - Speaker Encoder to compute speaker embeddings efficiently. - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN) - Fast and efficient model training. -- Detailed training logs on console and Tensorboard. -- Support for multi-speaker TTS. -- Efficient Multi-GPUs training. +- Detailed training logs on the terminal and Tensorboard. +- Support for Multi-speaker TTS. +- Efficient, flexible, lightweight but feature complete `Trainer API`. - Ability to convert PyTorch models to Tensorflow 2.0 and TFLite for inference. -- Released models in PyTorch, Tensorflow and TFLite. +- Released and read-to-use models. - Tools to curate Text2Speech datasets under```dataset_analysis```. -- Demo server for model testing. -- Notebooks for extensive model benchmarking. -- Modular (but not too much) code base enabling easy testing for new ideas. +- Utilities to use and test your models. +- Modular (but not too much) code base enabling easy implementation of new ideas. ## Implemented Models ### Text-to-Spectrogram @@ -98,8 +93,9 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) - WaveGrad: [paper](https://arxiv.org/abs/2009.00713) - HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) +- UnivNet: [paper](https://arxiv.org/abs/2106.07889) -You can also help us implement more models. Some 🐸TTS related work can be found [here](https://github.com/erogol/TTS-papers). +You can also help us implement more models. ## Install TTS 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**. @@ -110,7 +106,7 @@ If you are only interested in [synthesizing speech](https://github.com/coqui-ai/ pip install TTS ``` -By default this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. +By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. ```bash pip install TTS[tf] @@ -123,12 +119,6 @@ git clone https://github.com/coqui-ai/TTS pip install -e .[all,dev,notebooks,tf] # Select the relevant extras ``` -We use ```espeak-ng``` to convert graphemes to phonemes. You might need to install separately. - -```bash -sudo apt-get install espeak-ng -``` - If you are on Ubuntu (Debian), you can also run following commands for installation. ```bash @@ -137,6 +127,7 @@ $ make install ``` If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system). + ## Directory Structure ``` |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) @@ -147,6 +138,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht |- distribute.py (train your TTS model using Multiple GPUs.) |- compute_statistics.py (compute dataset statistics for normalization.) |- convert*.py (convert target torch model to TF.) + |- ... |- tts/ (text to speech models) |- layers/ (model layer definitions) |- models/ (model definitions) @@ -156,167 +148,4 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht |- (same) |- vocoder/ (Vocoder models.) 
|- (same) -``` - -## Sample Model Output -Below you see Tacotron model state after 16K iterations with batch-size 32 with LJSpeech dataset. - -> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning." - -Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) - -example_output - -## Datasets and Data-Loading -🐸TTS provides a generic dataloader easy to use for your custom dataset. -You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples. -After that, you need to set ```dataset``` fields in ```config.json```. - -Some of the public datasets that we successfully applied 🐸TTS: - -- [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) -- [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) -- [TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) -- [M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) -- [LibriTTS](https://openslr.org/60/) -- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 - -## Example: Synthesizing Speech on Terminal Using the Released Models. - - -After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. - -Listing released 🐸TTS models. - -```bash -tts --list_models -``` - -Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --vocoder_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS model (Using Griffin-Lim Vocoder) - -```bash -tts --text "Text for TTS" \ - --model_path path/to/model.pth.tar \ - --config_path path/to/config.json \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS and Vocoder models - -```bash -tts --text "Text for TTS" \ - --config_path path/to/config.json \ - --model_path path/to/model.pth.tar \ - --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth.tar \ - --vocoder_config_path path/to/vocoder_config.json -``` - -Run a multi-speaker TTS model from the released models list. - -```bash -tts --model_name "///" --list_speaker_idxs # list the possible speaker IDs. -tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx "" -``` - -**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. - -## Example: Using the Demo Server for Synthesizing Speech - - - - -You can boot up a demo 🐸TTS server to run inference with your models. Note that the server is not optimized for performance -but gives you an easy way to interact with the models. - -The demo server provides pretty much the same interface as the CLI command. - -```bash -tts-server -h # see the help -tts-server --list_models # list the available models. -``` - -Run a TTS model, from the release models list, with its default vocoder. 
-If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize -speech. - -```bash -tts-server --model_name "///" -``` - -Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts-server --model_name "///" \ - --vocoder_name "///" -``` - - -## Example: Training and Fine-tuning LJ-Speech Dataset -Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below. - -To start with, split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go. - -``` -shuf metadata.csv > metadata_shuf.csv -head -n 12000 metadata_shuf.csv > metadata_train.csv -tail -n 1100 metadata_shuf.csv > metadata_val.csv -``` - -To train a new model, you need to define your own ```config.json``` to define model details, trainin configuration and more (check the examples). Then call the corressponding train script. - -For instance, in order to train a tacotron or tacotron2 model on LJSpeech dataset, follow these steps. - -```bash -python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json -``` - -To fine-tune a model, use ```--restore_path```. - -```bash -python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar -``` - -To continue an old training run, use ```--continue_path```. - -```bash -python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/ -``` - -For multi-GPU training, call ```distribute.py```. It runs any provided train script in multi-GPU setting. - -```bash -CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json -``` - -Each run creates a new output folder accomodating used ```config.json```, model checkpoints and tensorboard logs. - -In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed. - -You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. - -## [Contribution guidelines](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md) -### Acknowledgement -- https://github.com/keithito/tacotron (Dataset pre-processing) -- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture) -- https://github.com/kan-bayashi/ParallelWaveGAN (GAN based vocoder library) -- https://github.com/jaywalnut310/glow-tts (Original Glow-TTS implementation) -- https://github.com/fatchord/WaveRNN/ (Original WaveRNN implementation) -- https://arxiv.org/abs/2010.05646 (Original HiFiGAN implementation) +``` \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..92dd33a1 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. 
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..73abe83f --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +furo +myst-parser == 0.15.1 +sphinx == 4.0.2 +sphinx_inline_tabs +sphinx_copybutton \ No newline at end of file diff --git a/docs/source/_static/logo.png b/docs/source/_static/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..6a1185c0966f9731a0e0f1878cc95a757d97107a GIT binary patch literal 38890 zcmeFYWmH_mnAtTs1 z!VdrdEvBEQu7|prH-(Fvvz4ubC54BtizS7nkF6B|;Imd=V4Fq5_pai_9j_a9iXwaP zTS0rcaZqqJSt@M>S4GWo1&ee9)yAYPtP29B_QliIOXtPQmCEVxN5i^BnKA?_oWwp?j6w*{pE-zTrOjeQ5BTp7q_T zAe|oLr<-7Fi+isfcl8AN$o)qmjVBh0rT?14aFNM!p9sI*c~ljlGy@ zT^zlAdSHAyycdO|U7Wro?R~hhS|Z;He*CE#)cttj_>Ejdpq{+wscLH)cW}D>=GSHS z$4>w0qYIgL4z|BOR9_nSihh&e6|K5|KTf{4H86em?C}hgyTrl+A%*F1@yXv@M7R-xB^%siRdSGJ(}If9t=wpD{#KmLj?h&Z^8wrl$Qa!=bP z;0nEq3!V)>16kSRnK(Gezr;WDn)2b!qCrmPdT>T6@FP{p9t?R zcZX{H!Uzp)Nk)gB5_Vojg3ZnML1O)k$VufAPprgnbYp}uq3r$WG|Is5iaze|g&l5l2> zO<|w@SKL#;8yGTEtOi;jUPE6Id6(lOiYPU4qc@5`bNWUt39I)CQ zHFwA8|9xFs(c4+~NSD96NQuL|;OFPz&7VFxUG(d`5=FFarx$gxZ$Hf~y-qB>vE|!h z6K&~muf4T6Z`wV#uo_sK?xJ7Xv|lqj6wzCBBF0AgANKkWz z&9D-vF@4nGV@x*)i3$+R^h+qQ;!TUA?;q6$tDLwKSPxzCUI$$dNT~gE{{hc4Ha5$wzeU~1T5>z*FF8-e3lSoJvY*U|EU5r@!I#|jSMBMJ8g=o7E0}*$CTX;OV0EN z>tp@S!j%44Vu$0IIOhk_I&hqg%9oa(V|b&S9oOIAOUTzY>zu1QwSURIiD*54M+4o~ zq^u%?rqNnXP(Xem@X?ctK{@U%gOJ1Koj2}otD&AWhg~pk>_@B4&K$MJrp355i zxQLh=RCCRpVR^WXCG)c~r?lHbOTp;HJ|127FrluNO6i`P_VkUmi-+Ye$nWhQ-ih^s zbes6#IzlsN3}TSO@iO#=FnsZ z?kfc^F_icZjetI8ERGIOq0FDfkt11f*h~qwTmxiQigVQAblSh*S{)wlsHMxiljULk zP7D8uc$TPZ+PN>l>I9vl$AQn=;IUR42T_r?GZ&z+SGtTphWvxh9IKdCuOKGYWA+{l&9UdVe9gLE)h zen_a{lC-NVCi=349AGzVNavrT;hJy4nn$f!?eE#qaG5HsvxLx|HCJb(a2q6>@6 zWIpf^L)ae$Q-Q}7ZUlGlf{CE|ZFK|r@~3=WoreT7@gea{;ceh`o!}5!S8az}y8$z! 
zFwEELxl*rmsmZH7khvrSwqhtUgGselmdD+y`hB?v(!wh@$87=BP@pq_oI1UDsD6w>|_S6 zs68UWcmeIS<~dCc?of@t^UCa(H@-QYBbDX?NllGS*Gabyihb7mHN5_{N3hx`nFQ}6 zuyyyP%Rm1z*5~km{-OFSonbLZZg5Uc19ziGC8rVW79I#sXu**Mt=fm^>7KbhkFrLi zy)aNcK)t7JetG@TfM5W17+!E6{=HebGYqeq>)jWIpCAF zoutFN5;Vbc`)^$z+`;;Hib9s1^ zNEGzAuA^*1`156z6|6NRWp-MyH;Irw@&@?K)*0c_V@79uFbyyg9-EFMPlcGa^qw;YHy;6lz7F};S1 z)6sNC=!H~8v!b*nNRjP>B+*)ABd#XEW!2GCW75GELHahStHNr>@#|izSdHI4(aA&m zQ^raw_J}KNgxmOD86g2rt zGT2JAh$3d@gcvdV70-Pso`<78G2Ft<3H_P-O%bY0YussG!`J4-u+fG0oO*TujSB?amym zb)KpWD+KSOJ=L3FB4bjM0KXI-lR9zy(732l=$!!=|*!dz2poLzvBRF_@9 zg{`7lfX2lF+kAnUIH0})SA6G#ga~F(KT&;%{=v9R0SgVsX2cQg!`i6)?p-`v=WgYI zOq>>opOu7W7J)6Zlc$&lle{_3m^71o8#P>nry;43gz};m2esH!oT;;JGO~l=N}huA zF+>N`QhKafD}m<2`nWru4WB^s@(m5p2WL|tqZ?6LlyWpd6}_CdQJ}9~w@;e~&6u3% z;@!&(teAL14@r9)mmRg@AayE1LV}Gw9cui^d29D5LB}RjB+Ro}eq{mvb@`qdrX?e~ zY+>=~SW=x8lHBm7M(i}`n_V5_!0jjz*?SCKgUlEhAqRXtxX>#ze&$aTfXAEtYk5um}DVG?}3-5 zfA~3!qU<{zyu%qm8@yD>-A;eZW08apSUtYxa*BTNUK39f&2OS9MRjF}NB*UPELtt5 zJo$#0*Q^EMDr%ny9({vB$<;g@wcP`0oIq_v*r}~!hEa?~P#}h-S8W;Ttu$0(y$04k zmBK>>iGHt{Jp)*-vb!{+R|oqpzD?4x`0OeQCp~-@uCBmX2*x}SwzFocvr~UWl&Wdw z?xpl?pSs zL^W9`zMK>N_39yB$5+I!HWWe4j6vDVh-aMaQcPlQ5ya;Sd9U6IsNPdlm}?Nkt);Ai z39v#JkF^C=Va`%6OWvBzAYd1Nm7|bmxsH2KmE5*_i6M_e!XD}+$nc^k#k~GnmOLe1 z{QQnV-lR+HqtaD~f@8+#)Zq7tZ^i>lYwTV5g^T(#CK9tH4o#1tW)~Jj0ydC5(P0QA z@=K@$F}GH<(v2@-K)e#5T+Xir^gYgInT)(WOG@uGFxq-*|ZT8G9!e1C!7yTprCl2q2NZ@uGhAH=|%XJ~D{GZ=8FOg4&l?RTNIBP3eYAWuXy=(meAAmT;w zHTC&XGUbAMs`{2=pr%BCSHC5i^OZW=28jDaW7i5?Y>w z&@WCjyi#ySn7>_;waY2;f+=4|29i%q)TyO@g)!tO52+!{iubS5&iNGe>)OKe^{hew zAvS=9sRrXQM)T_ASt6wP3ir#{M`=ufV$M#4XtRjWgMmZU`7FPY!^d{=b5$2YJQ@w! zfy!$TQ|q_V&obG|YP2fyjC@+;5B0fd8}X!yN?%@gP{olOBS44r2k0k3b?OgjW+ON_ zK}}G0yMg>q1K>)S3BC~S3|o84!SA%f8}dVV%gJsqsAQBUET-XJV-PqGk`HqiTgG>UMihkQ$}^Z%mkNWAl?N{ zbLvEnz4aU3Pt|K_raZ&JCPxx@kk7gZ$qag&yqNZYiR)a*wpicp!9*k~@ouKSiWtMp zx+QWkb$}tj%&q+JBJz4e4_Y`v$k)Ruvr;D6`?^2e@_nW3p_!Gu&i5(%B{GpAX1O6O z7)R*sQx0(~arLqLHc%L^6bUSe&NguX?3$-yyc6NxmZPx`(inz=XATQ0A??959-9G_ zhc%MJty8FfAJ?i!>w2W6n+Xsy0Yv}pmv&HXZH9uakF)7s-yaPuFg6M+Y?oAF; zjLtAgPt-D~ClI|1>(sy@D87?V1ElKChB}AxgW`2vDXqrw-N&{NbHvrYuFHS&aq&-Y z%I>wyW|2SDgc6j3an-WJ2#B17e;{u7ymuhnsnow>1B#>|>TH1}E#v88Qn%ls91E?X zBgCIRT7EhjKobw-szK$>1t3WgFueVl->=n2mJPk(%c7LSPfgXPbRrk`S(n^z${O{< zha1&UbWjS46!Gc%KmfL^s|p5Xc&WC}zDP|mK40X;93{6WqBACSWEc%p6QTxTc*V5h zRATI-pmIGod*)0dC6+z7@fVbrsNj7J! 
[GIT binary patch: base85-encoded binary file data omitted]
zfdXA*T-}Xn4o(4U<^)YrrCl!HZIfG9pw5vYhhQ@t;yHl+c9-ajoBcC>j~MY!zDA|$1SB+#hWqL|r02v!OL$#QCZQwjp9?4VDGjBr7iQQi4ENaJyFi0*VqBxj zjLV5&Ouy3eGD(BlhvT4Addf4(MhWkG;Wlvm3#vp&n!YFWa!q@oe%@Q~efzf9`^1S7 z9E!^!-_5cY>Siiu#od8D$Fg>4O((ojn9}pTdOMRLYU? z8qw!#nLp8_lj+Ja7HL*)y*$!A@{3WCv2I`=$%cN($s%;b|6WOmu@cKiYxteIh^G-R zH?JPv_W-s*!VBxg5p0U55Vcs%F&_lKt@D*ZKECouw0=5t<7b9}rj0mps|NVR{sdoX z)C2L>D~Arw9yg_}Uyhs8SD3yV?YCcTIwxo ze!5L>&d|(|0x{ObuBSr#a$(Blnha%8r>%!%<>6bt3aN-Lcxch!b)DS}Wp~$0;20#pmr7~a8k_eYH-C1| literal 0 HcmV?d00001 diff --git a/docs/source/audio_processor.md b/docs/source/audio_processor.md new file mode 100644 index 00000000..1a7bf8ae --- /dev/null +++ b/docs/source/audio_processor.md @@ -0,0 +1,25 @@ +# AudioProcessor + +`TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for + +- Feature extraction. +- Sound normalization. +- Reading and writing audio files. +- Sampling audio signals. +- Normalizing and denormalizing audio signals. +- Griffin-Lim vocoder. + +The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config +also must inherit or initiate `BaseAudioConfig`. + +## AudioProcessor +```{eval-rst} +.. autoclass:: TTS.utils.audio.AudioProcessor + :members: +``` + +## BaseAudioConfig +```{eval-rst} +.. autoclass:: TTS.config.shared_configs.BaseAudioConfig + :members: +``` \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..87c91d96 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,102 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath('../../TTS')) +autodoc_mock_imports = ["tts"] + +# -- Project information ----------------------------------------------------- +project = 'TTS' +copyright = "2021 Coqui GmbH, 2020 TTS authors" +author = 'Coqui GmbH' + +with open("../../TTS/VERSION", "r") as ver: + version = ver.read().strip() + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +release = version + +# The main toctree document. +master_doc = "index" + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'TODO/*'] + +source_suffix = [".rst", ".md"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. 
See the documentation for +# a list of builtin themes. +# +html_theme = 'furo' +html_tite = "TTS" +html_theme_options = { + "light_logo": "logo.png", + "dark_logo": "logo.png", + "sidebar_hide_name": True, +} + +html_sidebars = { + '**': [ + "sidebar/scroll-start.html", + "sidebar/brand.html", + "sidebar/search.html", + "sidebar/navigation.html", + "sidebar/ethical-ads.html", + "sidebar/scroll-end.html", + ] + } + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# using markdown +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.autosectionlabel', + 'myst_parser', + "sphinx_copybutton", + "sphinx_inline_tabs", +] + +# 'sphinxcontrib.katex', +# 'sphinx.ext.autosectionlabel', diff --git a/docs/source/configuration.md b/docs/source/configuration.md new file mode 100644 index 00000000..cde7e073 --- /dev/null +++ b/docs/source/configuration.md @@ -0,0 +1,59 @@ +# Configuration + +We use 👩‍✈️[Coqpit] for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is how a simple configuration looks like with Coqpit. + +```python +from dataclasses import asdict, dataclass, field +from typing import List, Union +from coqpit.coqpit import MISSING, Coqpit, check_argument + + +@dataclass +class SimpleConfig(Coqpit): + val_a: int = 10 + val_b: int = None + val_d: float = 10.21 + val_c: str = "Coqpit is great!" + vol_e: bool = True + # mandatory field + # raise an error when accessing the value if it is not changed. It is a way to define + val_k: int = MISSING + # optional field + val_dict: dict = field(default_factory=lambda: {"val_aa": 10, "val_ss": "This is in a dict."}) + # list of list + val_listoflist: List[List] = field(default_factory=lambda: [[1, 2], [3, 4]]) + val_listofunion: List[List[Union[str, int, bool]]] = field( + default_factory=lambda: [[1, 3], [1, "Hi!"], [True, False]] + ) + + def check_values( + self, + ): # you can define explicit constraints manually or by`check_argument()` + """Check config fields""" + c = asdict(self) # avoid unexpected changes on `self` + check_argument("val_a", c, restricted=True, min_val=10, max_val=2056) + check_argument("val_b", c, restricted=True, min_val=128, max_val=4058, allow_none=True) + check_argument("val_c", c, restricted=True) +``` + +In TTS, each model must have a configuration class that exposes all the values necessary for its lifetime. + +It defines model architecture, hyper-parameters, training, and inference settings. For our models, we merge all the fields in a single configuration class for ease. It may not look like a wise practice but enables easier bookkeeping and reproducible experiments. + +The general configuration hierarchy looks like below: + +``` +ModelConfig() + | + | -> ... # model specific configurations + | -> ModelArgs() # model class arguments + | -> BaseDatasetConfig() # only for tts models + | -> BaseXModelConfig() # Generic fields for `tts` and `vocoder` models. 
+ | + | -> BaseTrainingConfig() # trainer fields + | -> BaseAudioConfig() # audio processing fields +``` + +In the example above, ```ModelConfig()``` is the final configuration that the model receives and it has all the fields necessary for the model. + +We host pre-defined model configurations under ```TTS//configs/```.Although we recommend a unified config class, you can decompose it as you like as for your custom models as long as all the fields for the trainer, model, and inference APIs are provided. \ No newline at end of file diff --git a/docs/source/contributing.md b/docs/source/contributing.md new file mode 100644 index 00000000..5b272509 --- /dev/null +++ b/docs/source/contributing.md @@ -0,0 +1,3 @@ +```{include} ../../CONTRIBUTING.md +:relative-images: +``` diff --git a/docs/source/converting_torch_to_tf.md b/docs/source/converting_torch_to_tf.md new file mode 100644 index 00000000..6b992eb0 --- /dev/null +++ b/docs/source/converting_torch_to_tf.md @@ -0,0 +1,21 @@ +# Converting Torch Tacotron to TF 2 + +Currently, 🐸TTS supports the vanilla Tacotron2 and MelGAN models in TF 2.It does not support advanced attention methods and other small tricks used by the Torch models. You can convert any Torch model trained after v0.0.2. + +You can also export TF 2 models to TFLite for even faster inference. + +## How to convert from Torch to TF 2.0 +Make sure you installed Tensorflow v2.2. It is not installed by default by :frog: TTS. + +All the TF related code stays under ```tf``` folder. + +To convert a **compatible** Torch model, run the following command with the right arguments: + +```bash +python TTS/bin/convert_tacotron2_torch_to_tf.py\ + --torch_model_path /path/to/torch/model.pth.tar \ + --config_path /path/to/model/config.json\ + --output_path /path/to/output/tf/model +``` + +This will create a TF model file. Notice that our model format is not compatible with the official TF checkpoints. We created our custom format to match Torch checkpoints we use. Therefore, use the ```load_checkpoint``` and ```save_checkpoint``` functions provided under ```TTS.tf.generic_utils```. diff --git a/docs/source/dataset.md b/docs/source/dataset.md new file mode 100644 index 00000000..92d381ac --- /dev/null +++ b/docs/source/dataset.md @@ -0,0 +1,25 @@ +# Datasets + +## TTS Dataset + +```{eval-rst} +.. autoclass:: TTS.tts.datasets.TTSDataset + :members: +``` + +## Vocoder Dataset + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset + :members: +``` \ No newline at end of file diff --git a/docs/source/faq.md b/docs/source/faq.md new file mode 100644 index 00000000..6f5de6d8 --- /dev/null +++ b/docs/source/faq.md @@ -0,0 +1,114 @@ +# Humble FAQ +We tried to collect common issues and questions we receive about 🐸TTS. It is worth checking before going deeper. + +## Errors with a pre-trained model. How can I resolve this? +- Make sure you use the right commit version of 🐸TTS. Each pre-trained model has its corresponding version that needs to be used. It is defined on the model table. +- If it is still problematic, post your problem on [Discussions](https://github.com/coqui-ai/TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json etc.) 
+- If you feel like it's a bug to be fixed, then prefer Github issues with the same level of scrutiny. + +## What are the requirements of a good 🐸TTS dataset? +* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset + +## How should I choose the right model? +- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. +- Tacotron models produce the most natural voice if your dataset is not too noisy. +- If both models do not perform well and especially the attention does not align, then try AlignTTS or GlowTTS. +- If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. + +## How can I train my own `tts` model? +0. Check your dataset with notebooks in [dataset_analysis](https://github.com/coqui-ai/TTS/tree/master/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/coqui-ai/TTS/blob/master/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in a better audio synthesis. + +1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. + A `formatter` parses the metadata file and converts a list of training samples. + +2. If you have a dataset with a different alphabet than English, you need to set your own character list in the ```config.json```. + - If you use phonemes for training and your language is supported [here](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. + - You can use `TTS/bin/find_unique_chars.py` to get characters used in your dataset. + +3. Write your own text cleaner in ```utils.text.cleaners```. It is not always necessary, except when you have a different alphabet or language-specific requirements. + - A `cleaner` performs number and abbreviation expansion and text normalization. Basically, it converts the written text to its spoken format. + - If you go lazy, you can try using ```basic_cleaners```. + +4. Fill in a ```config.json```. Go over each parameter one by one and consider it regarding the appended explanation. + - Check the `Coqpit` class created for your target model. Coqpit classes for `tts` models are under `TTS/tts/configs/`. + - You just need to define fields you need/want to change in your `config.json`. For the rest, their default values are used. + - 'sample_rate', 'phoneme_language' (if phoneme enabled), 'output_path', 'datasets', 'text_cleaner' are the fields you need to edit in most of the cases. + - Here is a sample `config.json` for training a `GlowTTS` network. + ```json + { + "model": "glow_tts", + "batch_size": 32, + "eval_batch_size": 16, + "num_loader_workers": 4, + "num_eval_loader_workers": 4, + "run_eval": true, + "test_delay_epochs": -1, + "epochs": 1000, + "text_cleaner": "english_cleaners", + "use_phonemes": false, + "phoneme_language": "en-us", + "phoneme_cache_path": "phoneme_cache", + "print_step": 25, + "print_eval": true, + "mixed_precision": false, + "output_path": "recipes/ljspeech/glow_tts/", + "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."], + "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] + } + ``` + +6. Train your model. 
+ - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` + - MultiGPU training: ```CUDA_VISIBLE_DEVICES="0,1,2" python distribute.py --script train_tts.py --config_path config.json``` + - This command uses all the GPUs given in ```CUDA_VISIBLE_DEVICES```. If you don't specify, it uses all the GPUs available. + +**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. + +## How can I train in a different language? +- Check steps 2, 3, 4, 5 above. + +## How can I train multi-GPUs? +- Check step 5 above. + +## How can I check model performance? +- You can inspect model training and performance using ```tensorboard```. It will show you loss, attention alignment, model output. Go with the order below to measure the model performance. +1. Check ground truth spectrograms. If they do not look as they are supposed to, then check audio processing parameters in ```config.json```. +2. Check train and eval losses and make sure that they all decrease smoothly in time. +3. Check model spectrograms. Especially, training outputs should look similar to ground truth spectrograms after ~10K iterations. +4. Your model would not work well at test time until the attention has a near diagonal alignment. This is the sublime art of TTS training. + - Attention should converge diagonally after ~50K iterations. + - If attention does not converge, the probabilities are; + - Your dataset is too noisy or small. + - Samples are too long. + - Batch size is too small (batch_size < 32 would be having a hard time converging) + - You can also try other attention algorithms like 'graves', 'bidirectional_decoder', 'forward_attn'. + - 'bidirectional_decoder' is your ultimate savior, but it trains 2x slower and demands 1.5x more GPU memory. + - You can also try the other models like AlignTTS or GlowTTS. + +## How do I know when to stop training? +There is no single objective metric to decide the end of a training since the voice quality is a subjective matter. + +In our model trainings, we follow these steps; + +- Check test time audio outputs, if it does not improve more. +- Check test time attention maps, if they look clear and diagonal. +- Check validation loss, if it converged and smoothly went down or started to overfit going up. +- If the answer is YES for all of the above, then test the model with a set of complex sentences. For English, you can use the `TestAttention` notebook. + +Keep in mind that the approach above only validates the model robustness. It is hard to estimate the voice quality without asking the actual people. +The best approach is to pick a set of promising models and run a Mean-Opinion-Score study asking actual people to score the models. + +## My model does not learn. How can I debug? +- Go over the steps under "How can I check model performance?" + +## Attention does not align. How can I make it work? +- Check the 4th step under "How can I check model performance?" + +## How can I test a trained model? +- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +- If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. + +## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. +- In general, all of the above relates to the `stopnet`. It is the part of the model telling the `decoder` when to stop. 
+- In general, a poor `stopnet` relates to something else that is broken in your model or dataset. Especially the attention module. +- One common reason is the silent parts in the audio clips at the beginning and the ending. Check ```trim_db``` value in the config. You can find a better value for your dataset by using ```CheckSpectrogram``` notebook. If this value is too small, too much of the audio will be trimmed. If too big, then too much silence will remain. Both will curtail the `stopnet` performance. \ No newline at end of file diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md new file mode 100644 index 00000000..cc0e456a --- /dev/null +++ b/docs/source/formatting_your_dataset.md @@ -0,0 +1,82 @@ +# Formatting Your Dataset + +For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription. + +If you have a single audio file and you need to split it into clips, there are different open-source tools for you. We recommend Audacity. It is an open-source and free audio editing software. + +It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using `wav` file format. + +Let's assume you created the audio clips and their transcription. You can collect all your clips under a folder. Let's call this folder `wavs`. + +``` +/wavs + | - audio1.wav + | - audio2.wav + | - audio3.wav + ... +``` + +You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text. + +We recommend the following format delimited by `|`. + +``` +# metadata.txt + +audio1.wav | This is my sentence. +audio2.wav | This is maybe my sentence. +audio3.wav | This is certainly my sentence. +audio4.wav | Let this be your sentence. +... +``` + +In the end, we have the following folder structure +``` +/MyTTSDataset + | + | -> metadata.txt + | -> /wavs + | -> audio1.wav + | -> audio2.wav + | ... +``` + +The format above is taken from widely-used the [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) dataset. You can also download and see the dataset. 🐸TTS already provides tooling for the LJSpeech. if you use the same format, you can start training your models right away. + +## Dataset Quality + +Your dataset should have good coverage of the target language. It should cover the phonemic variety, exceptional sounds and syllables. This is extremely important for especially non-phonemic languages like English. + +For more info about dataset qualities and properties check our [post](https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset). + +## Using Your Dataset in 🐸TTS + +After you collect and format your dataset, you need to check two things. Whether you need a `formatter` and a `text_cleaner`. The `formatter` loads the text file (created above) as a list and the `text_cleaner` performs a sequence of text normalization operations that converts the raw text into the spoken representation (e.g. converting numbers to text, acronyms, and symbols to the spoken format). + +If you use a different dataset format then the LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own `formatter`. 
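As a rough illustration, a custom `formatter` for the pipe-delimited `metadata.txt` shown above might look like the sketch below. The function and variable names are placeholders, and the formatters shipped with 🐸TTS may differ in field order and path handling, so treat it only as a starting point and check the built-in formatters for the exact signature the data loader expects.

```python
import os


def my_dataset_formatter(root_path, meta_file):
    """Hypothetical formatter: parse an `audio | transcription` metadata file.

    Returns a list of `[audio, text, speaker_name]` items, matching the
    `List[List[]]` layout described below.
    """
    items = []
    speaker_name = "MyDataset"
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # skip empty lines
            audio_file, text = line.split("|", 1)
            audio_path = os.path.join(root_path, "wavs", audio_file.strip())
            items.append([audio_path, text.strip(), speaker_name])
    return items
```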
+ +If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`. + +What you get out of a `formatter` is a `List[List[]]` in the following format. + +``` +>>> formatter(metafile_path) +[["audio1.wav", "This is my sentence.", "MyDataset"], +["audio1.wav", "This is maybe a sentence.", "MyDataset"], +... +] +``` + +Each sub-list is parsed as ```["", "", "]```. +`````` is the dataset name for single speaker datasets and it is mainly used +in the multi-speaker models to map the speaker of the each sample. But for now, we only focus on single speaker datasets. + +The purpose of a `formatter` is to parse your metafile and load the audio file paths and transcriptions. Then, its output passes to a `Dataset` object. It computes features from the audio signals, calls text normalization routines, and converts raw text to +phonemes if needed. + +See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `tts` models. + +See `TTS.vocoder.datasets.*`, for different `Dataset` implementations for the `vocoder` models. + +See `TTS.utils.audio.AudioProcessor` that includes all the audio processing and feature extraction functions used in a +`Dataset` implementation. Feel free to add things as you need.passed \ No newline at end of file diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md new file mode 100644 index 00000000..5a9aaae7 --- /dev/null +++ b/docs/source/implementing_a_new_model.md @@ -0,0 +1,61 @@ +# Implementing a Model + +1. Implement layers. + + You can either implement the layers under `TTS/tts/layers/new_model.py` or in the model file `TTS/tts/model/new_model.py`. + You can also reuse layers already implemented. + +2. Test layers. + + We keep tests under `tests` folder. You can add `tts` layers tests under `tts_tests` folder. + Basic tests are checking input-output tensor shapes and output values for a given input. Consider testing extreme cases that are more likely to cause problems like `zero` tensors. + +3. Implement loss function. + + We keep loss functions under `TTS/tts/layers/losses.py`. You can also mix-and-match implemented loss functions as you like. + + A loss function returns a dictionary in a format ```{’loss’: loss, ‘loss1’:loss1 ...}``` and the dictionary must at least define the `loss` key which is the actual value used by the optimizer. All the items in the dictionary are automatically logged on the terminal and the Tensorboard. + +4. Test the loss function. + + As we do for the layers, you need to test the loss functions too. You need to check input/output tensor shapes, + expected output values for a given input tensor. For instance, certain loss functions have upper and lower limits and + it is a wise practice to test with the inputs that should produce these limits. + +5. Implement `MyModel`. + + In 🐸TTS, a model class is a self-sufficient implementation of a model directing all the interactions with the other + components. It is enough to implement the API provided by the `BaseModel` class to comply. + + A model interacts with the `Trainer API` for training, `Synthesizer API` for inference and testing. + + A 🐸TTS model must return a dictionary by the `forward()` and `inference()` functions. This dictionary must also include the `model_outputs` key that is considered as the main model output by the `Trainer` and `Synthesizer`. + + You can place your `tts` model implementation under `TTS/tts/models/new_model.py` then inherit and implement the `BaseTTS`. 
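To make the output convention concrete, here is a deliberately tiny sketch. It subclasses `torch.nn.Module` rather than `BaseTTS` so it stays self-contained on this page; a real model must subclass `BaseTTS` and implement its full training and inference API. Layer sizes and argument names are arbitrary placeholders.

```python
import torch
from torch import nn


class MyModel(nn.Module):
    """Toy example: character embedding plus a linear projection to 80 mel channels."""

    def __init__(self, num_chars=100, hidden_channels=256, out_channels=80):
        super().__init__()
        self.emb = nn.Embedding(num_chars, hidden_channels)
        self.proj = nn.Linear(hidden_channels, out_channels)

    def forward(self, text, text_lengths=None, **kwargs):
        hidden = self.emb(text)                      # [B, T, hidden_channels]
        # Every model output is a dict and must include the `model_outputs` key.
        return {"model_outputs": self.proj(hidden)}  # [B, T, out_channels]

    @torch.no_grad()
    def inference(self, text, **kwargs):
        # The same dictionary convention applies at inference time.
        return {"model_outputs": self.proj(self.emb(text))}
```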
+ + There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you + the infinite flexibility to add custom behaviours for your model and training routines. + + For more details, see {ref}`BaseTTS ` and `TTS/utils/callbacks.py`. + +6. Optionally, define `MyModelArgs`. + + `MyModelArgs` is a 👨‍✈️Coqpit class that sets all the class arguments of the `MyModel`. It should be enough to pass + an `MyModelArgs` instance to initiate the `MyModel`. + +7. Test `MyModel`. + + As the layers and the loss functions, it is recommended to test your model. One smart way for testing is that you + create two models with the exact same weights. Then we run a training loop with one of these models and + compare the weights with the other model. All the weights need to be different in a passing test. Otherwise, it + is likely that a part of the model is malfunctioning or not even attached to the model's computational graph. + +8. Define `MyModelConfig`. + + Place `MyModelConfig` file under `TTS/models/configs`. It is enough to inherit the `BaseTTSConfig` to make your + config compatible with the `Trainer`. You should also include `MyModelArgs` as a field if defined. The rest of the fields should define the model + specific values and parameters. + +9. Write Docstrings. + + We love you more when you document your code. ❤️ diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 00000000..82792fee --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,40 @@ + +```{include} ../../README.md +:relative-images: +``` + +---- + +# Documentation Content +```{eval-rst} +.. toctree:: + :maxdepth: 2 + :caption: Get started + + tutorial_for_nervous_beginners + installation + faq + contributing + +.. toctree:: + :maxdepth: 2 + :caption: Using 🐸TTS + + inference + implementing_a_new_model + training_a_model + configuration + formatting_your_dataset + what_makes_a_good_dataset + tts_datasets + +.. toctree:: + :maxdepth: 2 + :caption: Main Classes + + trainer_api + audio_processor + model_api + configuration + dataset +``` \ No newline at end of file diff --git a/docs/source/inference.md b/docs/source/inference.md new file mode 100644 index 00000000..544473bf --- /dev/null +++ b/docs/source/inference.md @@ -0,0 +1,103 @@ +(synthesizing_speech)= +# Synthesizing Speech + +First, you need to install TTS. We recommend using PyPi. You need to call the command below: + +```bash +$ pip install TTS +``` + +After the installation, 2 terminal commands are available. + +1. TTS Command Line Interface (CLI). - `tts` +2. Local Demo Server. - `tts-server` + +## On the Commandline - `tts` +![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) + +After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. + +Listing released 🐸TTS models. + +```bash +tts --list_models +``` + +Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) + +```bash +tts --text "Text for TTS" \ + --model_name "///" \ + --out_path folder/to/save/output.wav +``` + +Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
+ +```bash +tts --text "Text for TTS" \ + --model_name "///" \ + --vocoder_name "///" \ + --out_path folder/to/save/output.wav +``` + +Run your own TTS model (Using Griffin-Lim Vocoder) + +```bash +tts --text "Text for TTS" \ + --model_path path/to/model.pth.tar \ + --config_path path/to/config.json \ + --out_path folder/to/save/output.wav +``` + +Run your own TTS and Vocoder models + +```bash +tts --text "Text for TTS" \ + --config_path path/to/config.json \ + --model_path path/to/model.pth.tar \ + --out_path folder/to/save/output.wav \ + --vocoder_path path/to/vocoder.pth.tar \ + --vocoder_config_path path/to/vocoder_config.json +``` + +Run a multi-speaker TTS model from the released models list. + +```bash +tts --model_name "///" --list_speaker_idxs # list the possible speaker IDs. +tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx "" +``` + +**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. + +## On the Demo Server - `tts-server` + + +![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + +You can boot up a demo 🐸TTS server to run an inference with your models. Note that the server is not optimized for performance +but gives you an easy way to interact with the models. + +The demo server provides pretty much the same interface as the CLI command. + +```bash +tts-server -h # see the help +tts-server --list_models # list the available models. +``` + +Run a TTS model, from the release models list, with its default vocoder. +If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize +speech. + +```bash +tts-server --model_name "///" +``` + +Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. + +```bash +tts-server --model_name "///" \ + --vocoder_name "///" +``` + +## TorchHub +You can also use [this simple colab notebook](https://colab.research.google.com/drive/1iAe7ZdxjUIuN6V4ooaCt0fACEGKEn7HW?usp=sharing) using TorchHub to synthesize speech. \ No newline at end of file diff --git a/docs/source/installation.md b/docs/source/installation.md new file mode 100644 index 00000000..6532ee8e --- /dev/null +++ b/docs/source/installation.md @@ -0,0 +1,39 @@ +# Installation + +🐸TTS supports python >=3.6 <=3.9 and tested on Ubuntu 18.10, 19.10, 20.10. + +## Using `pip` + +`pip` is recommended if you want to use 🐸TTS only for inference. + +You can install from PyPI as follows: + +```bash +pip install TTS # from PyPI +``` + +By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. + +```bash +pip install TTS[tf] +``` + +Or install from Github: + +```bash +pip install git+https://github.com/coqui-ai/TTS # from Github +``` + +## Installing From Source + +This is recommended for development and more control over 🐸TTS. + +```bash +git clone https://github.com/coqui-ai/TTS/ +cd TTS +make system-deps # only on Linux systems. 
+make install +``` + +## On Windows +If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/ \ No newline at end of file diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 00000000..922152e9 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/model_api.md b/docs/source/model_api.md new file mode 100644 index 00000000..438901b7 --- /dev/null +++ b/docs/source/model_api.md @@ -0,0 +1,24 @@ +# Model API +Model API provides you a set of functions that easily make your model compatible with the `Trainer`, +`Synthesizer` and `ModelZoo`. + +## Base TTS Model + +```{eval-rst} +.. autoclass:: TTS.model.BaseModel + :members: +``` + +## Base `tts` Model + +```{eval-rst} +.. autoclass:: TTS.tts.models.base_tts.BaseTTS + :members: +``` + +## Base `vocoder` Model + +```{eval-rst} +.. autoclass:: TTS.tts.models.base_vocoder.BaseVocoder` + :members: +``` \ No newline at end of file diff --git a/docs/source/readthedocs.yml b/docs/source/readthedocs.yml new file mode 100644 index 00000000..59eed1f7 --- /dev/null +++ b/docs/source/readthedocs.yml @@ -0,0 +1,17 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: docs/conf.py + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.8 + install: + - requirements: doc/requirements.txt \ No newline at end of file diff --git a/docs/source/trainer_api.md b/docs/source/trainer_api.md new file mode 100644 index 00000000..a5c3cfb7 --- /dev/null +++ b/docs/source/trainer_api.md @@ -0,0 +1,17 @@ +# Trainer API + +The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but +can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. + + +## Trainer +```{eval-rst} +.. autoclass:: TTS.trainer.Trainer + :members: +``` + +## TrainingArgs +```{eval-rst} +.. autoclass:: TTS.trainer.TrainingArgs + :members: +``` \ No newline at end of file diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md new file mode 100644 index 00000000..a7e81f28 --- /dev/null +++ b/docs/source/training_a_model.md @@ -0,0 +1,165 @@ +# Training a Model + +1. Decide what model you want to use. + + Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model servers your needs. 
Other than referring to the papers, one easy way is to test the 🐸TTS + community models and see how fast and good each of the models. Or you can start a discussion on our communication channels. + +2. Understand the configuration class, its fields and values of your model. + + For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it. + +3. Go to the recipes and check the recipe of your target model. + + Recipes do not promise perfect models but they provide a good start point for `Nervous Beginners`. A recipe script training + a `GlowTTS` model on `LJSpeech` dataset looks like below. Let's be creative and call this script `train_glowtts.py`. + + ```python + # train_glowtts.py + + import os + + from TTS.tts.configs import GlowTTSConfig + from TTS.tts.configs import BaseDatasetConfig + from TTS.trainer import init_training, Trainer, TrainingArgs + + + output_path = os.path.dirname(os.path.abspath(__file__)) + dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) + config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] + ) + args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) + trainer = Trainer(args, config, output_path, c_logger, tb_logger) + trainer.fit() + ``` + + You need to change fields of the `BaseDatasetConfig` to match your own dataset and then update `GlowTTSConfig` + fields as you need. + + 4. Run the training. + + You need to call the python training script. + + ```bash + $ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py + ``` + + Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable. + To see available GPUs on your system, you can use `nvidia-smi` command on the terminal. + + If you like to run a multi-gpu training + + ```bash + $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py + ``` + + The example above runs a multi-gpu training using GPUs `0, 1, 2`. + + The beginning of a training run looks like below. + + ```console + > Experiment folder: /your/output_path/-Juni-23-2021_02+52-78899209 + > Using CUDA: True + > Number of GPUs: 1 + > Setting up Audio Processor... 
+ | > sample_rate:22050 + | > resample:False + | > num_mels:80 + | > min_level_db:-100 + | > frame_shift_ms:None + | > frame_length_ms:None + | > ref_level_db:20 + | > fft_size:1024 + | > power:1.5 + | > preemphasis:0.0 + | > griffin_lim_iters:60 + | > signal_norm:True + | > symmetric_norm:True + | > mel_fmin:0 + | > mel_fmax:None + | > spec_gain:20.0 + | > stft_pad_mode:reflect + | > max_norm:4.0 + | > clip_norm:True + | > do_trim_silence:True + | > trim_db:45 + | > do_sound_norm:False + | > stats_path:None + | > base:10 + | > hop_length:256 + | > win_length:1024 + | > Found 13100 files in /your/dataset/path/ljspeech/LJSpeech-1.1 + > Using model: glow_tts + + > Model has 28356129 parameters + + > EPOCH: 0/1000 + + > DataLoader initialization + | > Use phonemes: False + | > Number of instances : 12969 + | > Max length sequence: 187 + | > Min length sequence: 5 + | > Avg length sequence: 98.3403500655409 + | > Num. instances discarded by max-min (max=500, min=3) seq limits: 0 + | > Batch group size: 0. + + > TRAINING (2021-06-23 14:52:54) + + --> STEP: 0/405 -- GLOBAL_STEP: 0 + | > loss: 2.34670 + | > log_mle: 1.61872 + | > loss_dur: 0.72798 + | > align_error: 0.52744 + | > current_lr: 2.5e-07 + | > grad_norm: 5.036039352416992 + | > step_time: 5.8815 + | > loader_time: 0.0065 + ... + ``` + +5. Run the Tensorboard. + + ```bash + $ tensorboard --logdir= + ``` + +6. Check the logs and the Tensorboard and monitor the training. + + On the terminal and Tensorboard, you can monitor the losses and their changes over time. Also Tensorboard provides certain figures and sample outputs. + + Note that different models have different metrics, visuals and outputs to be displayed. + + You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions + that occur in a training. + +7. Use your best model for inference. + + Use `tts` or `tts-server` commands for testing your models. + + ```bash + $ tts --text "Text for TTS" \ + --model_path path/to/checkpoint_x.pth.tar \ + --config_path path/to/config.json \ + --out_path folder/to/save/output.wav + ``` + +8. Return to the step 1 and reiterate for training a `vocoder` model. + + In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. diff --git a/docs/source/tts_datasets.md b/docs/source/tts_datasets.md new file mode 100644 index 00000000..6075bc95 --- /dev/null +++ b/docs/source/tts_datasets.md @@ -0,0 +1,16 @@ +# TTS Datasets + +Some of the known public datasets that we successfully applied 🐸TTS: + +- [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) +- [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) +- [English - TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) +- [English - LibriTTS](https://openslr.org/60/) +- [English - VCTK](https://datashare.ed.ac.uk/handle/10283/2950) +- [Multilingual - M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) +- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 +- [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts) +- [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1) +- [Chinese](https://www.data-baker.com/open_source.html) + +Let us know if you use 🐸TTS on a different dataset. 
\ No newline at end of file diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md new file mode 100644 index 00000000..015e178d --- /dev/null +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -0,0 +1,175 @@ +# Tutorial For Nervous Beginners + +## Installation + +User friendly installation. Recommended only for synthesizing voice. + +```bash +$ pip install TTS +``` + +Developer friendly installation. + +```bash +$ git clone https://github.com/coqui-ai/TTS +$ cd TTS +$ pip install -e . +``` + +## Training a `tts` Model + +A breakdown of a simple script training a GlowTTS model on LJspeech dataset. See the comments for the explanation of +each line. + +### Pure Python Way + +```python +import os + +# GlowTTSConfig: all model related values for training, validating and testing. +from TTS.tts.configs import GlowTTSConfig + +# BaseDatasetConfig: defines name, formatter and path of the dataset. +from TTS.tts.configs import BaseDatasetConfig + +# init_training: Initialize and setup the training environment. +# Trainer: Where the ✨️ happens. +# TrainingArgs: Defines the set of arguments of the Trainer. +from TTS.trainer import init_training, Trainer, TrainingArgs + +# we use the same path as this script as our training folder. +output_path = os.path.dirname(os.path.abspath(__file__)) + +# set LJSpeech as our target dataset and define its path so that the Trainer knows what data formatter it needs. +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) + +# Configure the model. Every config class inherits the BaseTTSConfig to have all the fields defined for the Trainer. +config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) + +# Take the config and the default Trainer arguments, setup the training environment and override the existing +# config values from the terminal. So you can do the following. +# >>> python train.py --coqpit.batch_size 128 +args, config, output_path, _, _, _= init_training(TrainingArgs(), config) + +# Initiate the Trainer. +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training etc. +trainer = Trainer(args, config, output_path) + +# And kick it 🚀 +trainer.fit() +``` + +### CLI Way + +We still support running training from CLI like in the old days. The same training can be started as follows. + +1. Define your `config.json` + + ```json + { + "model": "glow_tts", + "batch_size": 32, + "eval_batch_size": 16, + "num_loader_workers": 4, + "num_eval_loader_workers": 4, + "run_eval": true, + "test_delay_epochs": -1, + "epochs": 1000, + "text_cleaner": "english_cleaners", + "use_phonemes": false, + "phoneme_language": "en-us", + "phoneme_cache_path": "phoneme_cache", + "print_step": 25, + "print_eval": true, + "mixed_precision": false, + "output_path": "recipes/ljspeech/glow_tts/", + "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] + } + ``` + +2. Start training. 
+ ```bash + $ CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path config.json + ``` + + + +## Training a `vocoder` Model + +```python +import os + +from TTS.vocoder.configs import HifiganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = HifiganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + # `vocoder` only needs a data path and they read recursively all the `.wav` files underneath. + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() +``` + +❗️ Note that you can also start the training run from CLI as the `tts` model above. + +## Synthesizing Speech + +You can run `tts` and synthesize speech directly on the terminal. + +```bash +$ tts -h # see the help +$ tts --list_models # list the available models. +``` + +![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) + + +You can call `tts-server` to start a local demo server that you can open it on +your favorite web browser and 🗣️. + +```bash +$ tts-server -h # see the help +$ tts-server --list_models # list the available models. +``` +![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + + + diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/what_makes_a_good_dataset.md new file mode 100644 index 00000000..49a2943b --- /dev/null +++ b/docs/source/what_makes_a_good_dataset.md @@ -0,0 +1,19 @@ +# What makes a good TTS dataset + +## What Makes a Good Dataset +* **Gaussian like distribution on clip and text lengths**. So plot the distribution of clip lengths and check if it covers enough short and long voice clips. +* **Mistake free**. Remove any wrong or broken files. Check annotations, compare transcript and audio length. +* **Noise free**. Background noise might lead your model to struggle, especially for a good alignment. Even if it learns the alignment, the final result is likely to be suboptimial. +* **Compatible tone and pitch among voice clips**. For instance, if you are using audiobook recordings for your project, it might have impersonations for different characters in the book. These differences between samples downgrade the model performance. +* **Good phoneme coverage**. Make sure that your dataset covers a good portion of the phonemes, di-phonemes, and in some languages tri-phonemes. +* **Naturalness of recordings**. For your model WISIAIL (What it sees is all it learns). Therefore, your dataset should accommodate all the attributes you want to hear from your model. + +## Preprocessing Dataset +If you like to use a bespoken dataset, you might like to perform a couple of quality checks before training. 🐸TTS provides a couple of notebooks (CheckSpectrograms, AnalyzeDataset) to expedite this part for you. + +* **AnalyzeDataset** is for checking dataset distribution in terms of the clip and transcript lengths. It is good to find outlier instances (too long, short text but long voice clip, etc.)and remove them before training. 
Keep in mind that we like to have a good balance between long and short clips to prevent any bias in training. If you have only short clips (1-3 secs), then your model might suffer for long sentences and if your instances are long, then it might not learn the alignment or might take too long to train the model. + +* **CheckSpectrograms** is to measure the noise level of the clips and find good audio processing parameters. The noise level might be observed by checking spectrograms. If spectrograms look cluttered, especially in silent parts, this dataset might not be a good candidate for a TTS project. If your voice clips are too noisy in the background, it makes things harder for your model to learn the alignment, and the final result might be different than the voice you are given. +If the spectrograms look good, then the next step is to find a good set of audio processing parameters, defined in ```config.json```. In the notebook, you can compare different sets of parameters and see the resynthesis results in relation to the given ground-truth. Find the best parameters that give the best possible synthesis performance. + +Another practical detail is the quantization level of the clips. If your dataset has a very high bit-rate, that might cause slow data-load time and consequently slow training. It is better to reduce the sample-rate of your dataset to around 16000-22050. \ No newline at end of file From ae6405bb76baece21a69c65cc45325855f223a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 28 Jun 2021 16:53:57 +0200 Subject: [PATCH 229/258] Docstrings for `Trainer` --- TTS/trainer.py | 68 ++++++++++++++++++++++++++--- TTS/tts/layers/tacotron/tacotron.py | 2 + TTS/tts/utils/text/__init__.py | 1 + 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index e3403bae..0e921335 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -4,6 +4,7 @@ import glob import importlib import logging import os +import platform import re import sys import time @@ -40,13 +41,21 @@ from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_availa from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.models import setup_model as setup_vocoder_model +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + if is_apex_available(): from apex import amp @dataclass class TrainingArgs(Coqpit): - """Trainer arguments""" + """Trainer arguments to be defined externally. It helps integrating the `Trainer` with the higher level APIs and + set the values for distributed training.""" continue_path: str = field( default="", @@ -360,7 +369,7 @@ class Trainer: return self._get_loader(self.model, self.config, ap, True, data_items, verbose, self.num_gpus) def format_batch(self, batch: List) -> Dict: - """Format dataloader ouput and return a batch. + """Format the dataloader output and return a batch. Args: batch (List): Batch returned by the dataloader. @@ -633,7 +642,7 @@ class Trainer: return outputs, loss_dict def train_epoch(self) -> None: - """Main entry point for training. Run training on the whole training samples.""" + """Main entry point for the training loop. 
Run training on the all training samples.""" self.train_loader = self.get_train_dataloader( self.ap, self.data_train, @@ -682,6 +691,15 @@ class Trainer: return model.eval_step(*input_args) def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + """Perform a evaluation step on a batch of inputs and log the process. + + Args: + batch (Dict): Input batch. + step (int): Current step number in this epoch. + + Returns: + Tuple[Dict, Dict]: Model outputs and losses. + """ with torch.no_grad(): outputs_per_optimizer = None loss_dict = {} @@ -708,6 +726,7 @@ class Trainer: return outputs, loss_dict def eval_epoch(self) -> None: + """Main entry point for the evaluation loop. Run evaluation on the all validation samples.""" self.eval_loader = ( self.get_eval_dataloader( self.ap, @@ -743,7 +762,7 @@ class Trainer: def test_run(self) -> None: """Run test and log the results. Test run must be defined by the model. - Model must return figures and audios to be logged by the Tensorboard logger.""" + Model must return figures and audios to be logged by the Tensorboard.""" if hasattr(self.model, "test_run"): if hasattr(self.eval_loader.load_test_samples): samples = self.eval_loader.load_test_samples(1) @@ -820,11 +839,22 @@ class Trainer: ) @staticmethod - def _is_apex_available(): + def _is_apex_available() -> bool: + """Check if Nvidia's APEX is available.""" return importlib.util.find_spec("apex") is not None @staticmethod def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]: + """Receive the optimizer from the model if model implements `get_optimizer()` else + check the optimizer parameters in the config and try initiating the optimizer. + + Args: + model (nn.Module): Training model. + config (Coqpit): Training configuration. + + Returns: + Union[torch.optim.Optimizer, List]: A optimizer or a list of optimizers. GAN models define a list. + """ if hasattr(model, "get_optimizer"): optimizer = model.get_optimizer() if optimizer is None: @@ -835,6 +865,16 @@ class Trainer: @staticmethod def get_lr(model: nn.Module, config: Coqpit) -> Union[float, List[float]]: + """Set the initial learning rate by the model if model implements `get_lr()` else try setting the learning rate + fromthe config. + + Args: + model (nn.Module): Training model. + config (Coqpit): Training configuration. + + Returns: + Union[float, List[float]]: A single learning rate or a list of learning rates, one for each optimzier. + """ lr = None if hasattr(model, "get_lr"): lr = model.get_lr() @@ -846,6 +886,16 @@ class Trainer: def get_scheduler( model: nn.Module, config: Coqpit, optimizer: Union[torch.optim.Optimizer, List] ) -> Union[torch.optim.lr_scheduler._LRScheduler, List]: # pylint: disable=protected-access + """Receive the scheduler from the model if model implements `get_scheduler()` else + check the config and try initiating the scheduler. + + Args: + model (nn.Module): Training model. + config (Coqpit): Training configuration. + + Returns: + Union[torch.optim.Optimizer, List]: A scheduler or a list of schedulers, one for each optimizer. + """ scheduler = None if hasattr(model, "get_scheduler"): scheduler = model.get_scheduler(optimizer) @@ -857,6 +907,14 @@ class Trainer: @staticmethod def get_criterion(model: nn.Module) -> nn.Module: + """Receive the criterion from the model. Model must implement `get_criterion()`. + + Args: + model (nn.Module): Training model. + + Returns: + nn.Module: Criterion layer. 
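+
+        Example (an illustrative sketch; `model` is any 🐸TTS model that implements `get_criterion()`):
+
+            >>> criterion = Trainer.get_criterion(model)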
+ """ criterion = None criterion = model.get_criterion() return criterion diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index a6579171..47b5ea7e 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,6 @@ # coding: utf-8 +# adapted from https://github.com/r9y9/tacotron_pytorch + import torch from torch import nn diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 787394b5..fdccf7f1 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +# adapted from https://github.com/keithito/tacotron import re import unicodedata From 51398cd15bb2fb03981ba97810c3817bbb3ca518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 27 Jun 2021 20:56:11 +0200 Subject: [PATCH 230/258] Add docstrings and typing for `audio.py` --- TTS/utils/audio.py | 221 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 190 insertions(+), 31 deletions(-) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index e1913e98..3706b4ec 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -1,3 +1,5 @@ +from typing import Dict, Tuple + import librosa import numpy as np import scipy.io.wavfile @@ -217,7 +219,12 @@ class AudioProcessor(object): ### setting up the parameters ### def _build_mel_basis( self, - ): + ) -> np.ndarray: + """Build melspectrogram basis. + + Returns: + np.ndarray: melspectrogram basis. + """ if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( @@ -226,8 +233,12 @@ class AudioProcessor(object): def _stft_parameters( self, - ): - """Compute necessary stft parameters with given time values""" + ) -> Tuple[int, int]: + """Compute the real STFT parameters from the time values. + + Returns: + Tuple[int, int]: hop length and window length for STFT. + """ factor = self.frame_length_ms / self.frame_shift_ms assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) @@ -235,8 +246,18 @@ class AudioProcessor(object): return hop_length, win_length ### normalization ### - def normalize(self, S): - """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" + def normalize(self, S: np.ndarray) -> np.ndarray: + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + + Args: + S (np.ndarray): Spectrogram to normalize. + + Raises: + RuntimeError: Mean and variance is computed from incompatible parameters. + + Returns: + np.ndarray: Normalized spectrogram. + """ # pylint: disable=no-else-return S = S.copy() if self.signal_norm: @@ -266,8 +287,18 @@ class AudioProcessor(object): else: return S - def denormalize(self, S): - """denormalize values""" + def denormalize(self, S: np.ndarray) -> np.ndarray: + """Denormalize spectrogram values. + + Args: + S (np.ndarray): Spectrogram to denormalize. + + Raises: + RuntimeError: Mean and variance are incompatible. + + Returns: + np.ndarray: Denormalized spectrogram. + """ # pylint: disable=no-else-return S_denorm = S.copy() if self.signal_norm: @@ -295,7 +326,16 @@ class AudioProcessor(object): return S_denorm ### Mean-STD scaling ### - def load_stats(self, stats_path): + def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + """Loading mean and variance statistics from a `npy` file. 
+ + Args: + stats_path (str): Path to the `npy` file containing + + Returns: + Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to + compute them. + """ stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg mel_mean = stats["mel_mean"] mel_std = stats["mel_std"] @@ -314,7 +354,17 @@ class AudioProcessor(object): return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init - def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): + def setup_scaler( + self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray + ) -> None: + """Initialize scaler objects used in mean-std normalization. + + Args: + mel_mean (np.ndarray): Mean for melspectrograms. + mel_std (np.ndarray): STD for melspectrograms. + linear_mean (np.ndarray): Mean for full scale spectrograms. + linear_std (np.ndarray): STD for full scale spectrograms. + """ self.mel_scaler = StandardScaler() self.mel_scaler.set_stats(mel_mean, mel_std) self.linear_scaler = StandardScaler() @@ -322,32 +372,78 @@ class AudioProcessor(object): ### DB and AMP conversion ### # pylint: disable=no-self-use - def _amp_to_db(self, x): + def _amp_to_db(self, x: np.ndarray) -> np.ndarray: + """Convert amplitude values to decibels. + + Args: + x (np.ndarray): Amplitude spectrogram. + + Returns: + np.ndarray: Decibels spectrogram. + """ + return self.spec_gain * _log(np.maximum(1e-5, x), self.base) # pylint: disable=no-self-use - def _db_to_amp(self, x): + def _db_to_amp(self, x: np.ndarray) -> np.ndarray: + """Convert decibels spectrogram to amplitude spectrogram. + + Args: + x (np.ndarray): Decibels spectrogram. + + Returns: + np.ndarray: Amplitude spectrogram. + """ return _exp(x / self.spec_gain, self.base) ### Preemphasis ### - def apply_preemphasis(self, x): + def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. + + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ if self.preemphasis == 0: raise RuntimeError(" [!] Preemphasis is set 0.0.") return scipy.signal.lfilter([1, -self.preemphasis], [1], x) - def apply_inv_preemphasis(self, x): + def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Reverse pre-emphasis.""" if self.preemphasis == 0: raise RuntimeError(" [!] Preemphasis is set 0.0.") return scipy.signal.lfilter([1], [1, -self.preemphasis], x) ### SPECTROGRAMs ### - def _linear_to_mel(self, spectrogram): + def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray: + """Project a full scale spectrogram to a melspectrogram. + + Args: + spectrogram (np.ndarray): Full scale spectrogram. + + Returns: + np.ndarray: Melspectrogram + """ return np.dot(self.mel_basis, spectrogram) - def _mel_to_linear(self, mel_spec): + def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) - def spectrogram(self, y): + def spectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + y (np.ndarray): Waveform. + + Returns: + np.ndarray: Spectrogram. 
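+
+        Example (an illustrative sketch; the argument values mirror the LJSpeech settings printed in the
+            training log and any field left out falls back to its default):
+
+            >>> ap = AudioProcessor(sample_rate=22050, num_mels=80, min_level_db=-100, ref_level_db=20,
+            ...                     fft_size=1024, hop_length=256, win_length=1024)
+            >>> wav = ap.load_wav("sample.wav")
+            >>> spec = ap.spectrogram(wav)  # normalized spectrogram with shape [num_freqs, num_frames]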
+ """ if self.preemphasis != 0: D = self._stft(self.apply_preemphasis(y)) else: @@ -355,7 +451,8 @@ class AudioProcessor(object): S = self._amp_to_db(np.abs(D)) return self.normalize(S).astype(np.float32) - def melspectrogram(self, y): + def melspectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" if self.preemphasis != 0: D = self._stft(self.apply_preemphasis(y)) else: @@ -363,8 +460,8 @@ class AudioProcessor(object): S = self._amp_to_db(self._linear_to_mel(np.abs(D))) return self.normalize(S).astype(np.float32) - def inv_spectrogram(self, spectrogram): - """Converts spectrogram to waveform using librosa""" + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" S = self.denormalize(spectrogram) S = self._db_to_amp(S) # Reconstruct phase @@ -372,8 +469,8 @@ class AudioProcessor(object): return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self._griffin_lim(S ** self.power) - def inv_melspectrogram(self, mel_spectrogram): - """Converts melspectrogram to waveform using librosa""" + def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" D = self.denormalize(mel_spectrogram) S = self._db_to_amp(D) S = self._mel_to_linear(S) # Convert back to linear @@ -381,7 +478,15 @@ class AudioProcessor(object): return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self._griffin_lim(S ** self.power) - def out_linear_to_mel(self, linear_spec): + def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + linear_spec (np.ndarray): Normalized full scale linear spectrogram. + + Returns: + np.ndarray: Normalized melspectrogram. + """ S = self.denormalize(linear_spec) S = self._db_to_amp(S) S = self._linear_to_mel(np.abs(S)) @@ -390,7 +495,15 @@ class AudioProcessor(object): return mel ### STFT and ISTFT ### - def _stft(self, y): + def _stft(self, y: np.ndarray) -> np.ndarray: + """Librosa STFT wrapper. + + Args: + y (np.ndarray): Audio signal. + + Returns: + np.ndarray: Complex number array. + """ return librosa.stft( y=y, n_fft=self.fft_size, @@ -401,7 +514,8 @@ class AudioProcessor(object): center=True, ) - def _istft(self, y): + def _istft(self, y: np.ndarray) -> np.ndarray: + """Librosa iSTFT wrapper.""" return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) def _griffin_lim(self, S): @@ -414,7 +528,8 @@ class AudioProcessor(object): return y def compute_stft_paddings(self, x, pad_sides=1): - """compute right padding (final frame) or both sides padding (first and final frames)""" + """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding + (first and final frames)""" assert pad_sides in (1, 2) pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] if pad_sides == 1: @@ -434,7 +549,17 @@ class AudioProcessor(object): # return f0 ### Audio Processing ### - def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): + def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. 
+ min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + + Returns: + int: Last point without silence. + """ window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4) threshold = self._db_to_amp(threshold_db) @@ -452,11 +577,28 @@ class AudioProcessor(object): ] @staticmethod - def sound_norm(x): + def sound_norm(x: np.ndarray) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: Volume normalized waveform. + """ return x / abs(x).max() * 0.95 ### save and load ### - def load_wav(self, filename, sr=None): + def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + + Returns: + np.ndarray: Loaded waveform. + """ if self.resample: x, sr = librosa.load(filename, sr=self.sample_rate) elif sr is None: @@ -473,12 +615,19 @@ class AudioProcessor(object): x = self.sound_norm(x) return x - def save_wav(self, wav, path, sr=None): + def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + """Save a waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + """ wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) @staticmethod - def mulaw_encode(wav, qc): + def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: mu = 2 ** qc - 1 # wav_abs = np.minimum(np.abs(wav), 1.0) signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) @@ -500,11 +649,21 @@ class AudioProcessor(object): return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16) @staticmethod - def quantize(x, bits): + def quantize(x: np.ndarray, bits: int) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. + bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. 
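+
+        Example (illustrative only):
+
+            >>> import numpy as np
+            >>> x_q = AudioProcessor.quantize(np.array([-1.0, 0.0, 1.0]), bits=8)  # -> [0.0, 127.5, 255.0]
+            >>> x_rec = AudioProcessor.dequantize(x_q, bits=8)  # back to [-1.0, 0.0, 1.0]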
+ """ return (x + 1.0) * (2 ** bits - 1) / 2 @staticmethod def dequantize(x, bits): + """Dequantize a waveform from the given number of bits.""" return 2 * x / (2 ** bits - 1) - 1 From 828e1265a6d84a24ed9144d184c6f6df202018b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 28 Jun 2021 16:54:08 +0200 Subject: [PATCH 231/258] Update torch hub --- hubconf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hubconf.py b/hubconf.py index bcbd6fce..96f12b5f 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,5 +1,5 @@ dependencies = [ - 'torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin' + 'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite` ] import torch From 47b3b10d6dae196ffd3a9bd9370cb8bba4d4713b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 29 Jun 2021 13:07:59 +0200 Subject: [PATCH 232/258] =?UTF-8?q?Bump=20up=20to=20v0.1.0=20=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/VERSION | 2 +- tests/vocoder_tests/test_multiband_melgan_train.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/TTS/VERSION b/TTS/VERSION index e3b86dd9..6e8bf73a 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.16 +0.1.0 diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index daf2841b..c49107bd 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -20,7 +20,6 @@ config = MultibandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, - discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]}, output_path=output_path, From 4b5421b42f0adc335d0847531db39cf9e972516c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 29 Jun 2021 13:20:40 +0200 Subject: [PATCH 233/258] Remove FAQ link from README.md --- README.md | 3 +-- TTS/VERSION | 2 +- TTS/trainer.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 842a16d0..136fb4d8 100644 --- a/README.md +++ b/README.md @@ -25,10 +25,9 @@ Please use our dedicated channels for questions and discussion. 
Help is much mor | Type | Platforms | | ------------------------------- | --------------------------------------- | | 🚨 **Bug Reports** | [GitHub Issue Tracker] | -| ❔ **FAQ** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/FAQ) | | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] | | 👩‍💻 **Usage Questions** | [Github Discussions] | -| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room]| +| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] | [github issue tracker]: https://github.com/coqui-ai/tts/issues [github discussions]: https://github.com/coqui-ai/TTS/discussions diff --git a/TTS/VERSION b/TTS/VERSION index 6e8bf73a..6c6aa7cb 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.1.0 +0.1.0 \ No newline at end of file diff --git a/TTS/trainer.py b/TTS/trainer.py index 0e921335..b28eceb5 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1018,7 +1018,7 @@ def process_args(args, config=None): config = load_config(args.config_path) else: # init from console args - from TTS.config.shared_configs import BaseTrainingConfig + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel config_base = BaseTrainingConfig() config_base.parse_known_args(coqpit_overrides) From 5723eb47383fb38b6caadf0f431403e99aeeff42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 29 Jun 2021 16:41:08 +0200 Subject: [PATCH 234/258] Fix config init in `process_args` --- TTS/trainer.py | 21 +++++++++++---------- recipes/ljspeech/hifigan/train_hifigan.py | 3 +-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/TTS/trainer.py b/TTS/trainer.py index b28eceb5..93efeef4 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -1012,17 +1012,18 @@ def process_args(args, config=None): args.restore_path, best_model = get_last_checkpoint(args.continue_path) if not args.best_path: args.best_path = best_model - # init config - if config is None and args.config_path: - # init from a file - config = load_config(args.config_path) - else: - # init from console args - from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + # init config if not already defined + if config is None: + if args.config_path: + # init from a file + config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel - config_base = BaseTrainingConfig() - config_base.parse_known_args(coqpit_overrides) - config = register_config(config_base.model)() + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() # override values from command-line args config.parse_known_args(coqpit_overrides, relaxed_parser=True) if config.mixed_precision: diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 99b39e99..af615ace 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -1,8 +1,7 @@ import os +from TTS.trainer import Trainer, TrainingArgs, init_training from TTS.vocoder.configs import HifiganConfig -from TTS.trainer import init_training, Trainer, TrainingArgs - output_path = os.path.dirname(os.path.abspath(__file__)) config = HifiganConfig( From 3584a04552b210bf852ad2d90a255d1f838f6fb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 30 Jun 2021 14:09:05 +0200 Subject: [PATCH 235/258] Move `main_classes` to a 
separate folder --- docs/source/main_classes/audio_processor.md | 25 +++++++++++++++++++++ docs/source/main_classes/dataset.md | 25 +++++++++++++++++++++ docs/source/main_classes/gan.md | 12 ++++++++++ docs/source/main_classes/model_api.md | 24 ++++++++++++++++++++ docs/source/main_classes/trainer_api.md | 17 ++++++++++++++ 5 files changed, 103 insertions(+) create mode 100644 docs/source/main_classes/audio_processor.md create mode 100644 docs/source/main_classes/dataset.md create mode 100644 docs/source/main_classes/gan.md create mode 100644 docs/source/main_classes/model_api.md create mode 100644 docs/source/main_classes/trainer_api.md diff --git a/docs/source/main_classes/audio_processor.md b/docs/source/main_classes/audio_processor.md new file mode 100644 index 00000000..1a7bf8ae --- /dev/null +++ b/docs/source/main_classes/audio_processor.md @@ -0,0 +1,25 @@ +# AudioProcessor + +`TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for + +- Feature extraction. +- Sound normalization. +- Reading and writing audio files. +- Sampling audio signals. +- Normalizing and denormalizing audio signals. +- Griffin-Lim vocoder. + +The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config +also must inherit or initiate `BaseAudioConfig`. + +## AudioProcessor +```{eval-rst} +.. autoclass:: TTS.utils.audio.AudioProcessor + :members: +``` + +## BaseAudioConfig +```{eval-rst} +.. autoclass:: TTS.config.shared_configs.BaseAudioConfig + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/dataset.md b/docs/source/main_classes/dataset.md new file mode 100644 index 00000000..92d381ac --- /dev/null +++ b/docs/source/main_classes/dataset.md @@ -0,0 +1,25 @@ +# Datasets + +## TTS Dataset + +```{eval-rst} +.. autoclass:: TTS.tts.datasets.TTSDataset + :members: +``` + +## Vocoder Dataset + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/gan.md b/docs/source/main_classes/gan.md new file mode 100644 index 00000000..4524b4b5 --- /dev/null +++ b/docs/source/main_classes/gan.md @@ -0,0 +1,12 @@ +# GAN API + +The {class}`TTS.vocoder.models.gan.GAN` provides an easy way to implementing new GAN based models. You just need +to define the model architectures for the generator and the discriminator networks and give them to the `GAN` class +to do its ✨️. + + +## GAN +```{eval-rst} +.. autoclass:: TTS.vocoder.models.gan.GAN + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md new file mode 100644 index 00000000..438901b7 --- /dev/null +++ b/docs/source/main_classes/model_api.md @@ -0,0 +1,24 @@ +# Model API +Model API provides you a set of functions that easily make your model compatible with the `Trainer`, +`Synthesizer` and `ModelZoo`. + +## Base TTS Model + +```{eval-rst} +.. autoclass:: TTS.model.BaseModel + :members: +``` + +## Base `tts` Model + +```{eval-rst} +.. autoclass:: TTS.tts.models.base_tts.BaseTTS + :members: +``` + +## Base `vocoder` Model + +```{eval-rst} +.. 
autoclass:: TTS.tts.models.base_vocoder.BaseVocoder` + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md new file mode 100644 index 00000000..a5c3cfb7 --- /dev/null +++ b/docs/source/main_classes/trainer_api.md @@ -0,0 +1,17 @@ +# Trainer API + +The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but +can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. + + +## Trainer +```{eval-rst} +.. autoclass:: TTS.trainer.Trainer + :members: +``` + +## TrainingArgs +```{eval-rst} +.. autoclass:: TTS.trainer.TrainingArgs + :members: +``` \ No newline at end of file From 21126839a86a9a0b328ea68ff96061d4077c65a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 30 Jun 2021 14:09:49 +0200 Subject: [PATCH 236/258] Update .gitignore --- .gitignore | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 3125f3f9..6f412def 100644 --- a/.gitignore +++ b/.gitignore @@ -140,11 +140,8 @@ events.out* old_configs/* model_importers/* model_profiling/* -<<<<<<< HEAD docs/source/TODO/* -======= -docs/* ->>>>>>> univnet +docs/source/models/* .noseids .dccache log.txt From 2e1a428b83350cd1fc402e544bce6f462572cb67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 30 Jun 2021 14:30:55 +0200 Subject: [PATCH 237/258] Update glowtts docstrings and docs --- CONTRIBUTING.md | 2 +- README.md | 2 +- TTS/trainer.py | 2 +- TTS/tts/configs/glow_tts_config.py | 2 +- TTS/tts/layers/glow_tts/decoder.py | 17 ++- TTS/tts/layers/glow_tts/duration_predictor.py | 22 ++-- TTS/tts/layers/glow_tts/encoder.py | 26 +++-- TTS/tts/layers/glow_tts/glow.py | 58 ++++++---- TTS/tts/layers/glow_tts/transformer.py | 49 ++++---- TTS/tts/models/glow_tts.py | 48 ++++---- TTS/utils/audio.py | 2 +- TTS/vocoder/models/gan.py | 109 +++++++++++++++++- TTS/vocoder/tf/layers/pqmf.py | 2 +- TTS/vocoder/tf/models/melgan_generator.py | 2 +- TTS/vocoder/utils/distribution.py | 2 +- docs/source/audio_processor.md | 25 ---- docs/source/conf.py | 57 +++++---- docs/source/converting_torch_to_tf.md | 2 +- docs/source/dataset.md | 25 ---- docs/source/faq.md | 2 +- docs/source/implementing_a_new_model.md | 2 +- docs/source/index.md | 27 +++-- docs/source/main_classes/audio_processor.md | 2 +- docs/source/main_classes/model_api.md | 2 +- docs/source/model_api.md | 24 ---- docs/source/trainer_api.md | 17 --- 26 files changed, 305 insertions(+), 225 deletions(-) delete mode 100644 docs/source/audio_processor.md delete mode 100644 docs/source/dataset.md delete mode 100644 docs/source/model_api.md delete mode 100644 docs/source/trainer_api.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2a9620a7..831eddd5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Welcome to the 🐸TTS! -This repository is governed by the Contributor Covenant Code of Conduct - [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). +This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md). ## Where to start. We welcome everyone who likes to contribute to 🐸TTS. diff --git a/README.md b/README.md index 136fb4d8..ee7f91f2 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ 🐸TTS is a library for advanced Text-to-Speech generation. 
It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality. 🐸TTS comes with [pretrained models](https://github.com/coqui-ai/TTS/wiki/Released-Models), tools for measuring dataset quality and already used in **20+ languages** for products and research projects. -[![CircleCI](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)]() +[![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)](https://github.com/coqui-ai/TTS/actions) [![License]()](https://opensource.org/licenses/MPL-2.0) [![Docs]()](https://tts.readthedocs.io/en/latest/) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) diff --git a/TTS/trainer.py b/TTS/trainer.py index 93efeef4..c56be140 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -985,7 +985,7 @@ def get_last_checkpoint(path): def process_args(args, config=None): - """Process parsed comand line arguments. + """Process parsed comand line arguments and initialize the config if not provided. Args: args (argparse.Namespace or dict like): Parsed input arguments. diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 19b7abd9..cfa7cde2 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -7,7 +7,7 @@ from TTS.tts.configs.shared_configs import BaseTTSConfig class GlowTTSConfig(BaseTTSConfig): """Defines parameters for GlowTTS model. - Example: + Example: >>> from TTS.tts.configs import GlowTTSConfig >>> config = GlowTTSConfig() diff --git a/TTS/tts/layers/glow_tts/decoder.py b/TTS/tts/layers/glow_tts/decoder.py index 7b3f0ed1..f57c3731 100644 --- a/TTS/tts/layers/glow_tts/decoder.py +++ b/TTS/tts/layers/glow_tts/decoder.py @@ -12,7 +12,8 @@ def squeeze(x, x_mask=None, num_sqz=2): Note: each 's' is a n-dimensional vector. - [s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]""" + ``[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]`` + """ b, c, t = x.size() t = (t // num_sqz) * num_sqz @@ -32,7 +33,8 @@ def unsqueeze(x, x_mask=None, num_sqz=2): Note: each 's' is a n-dimensional vector. - [[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5], [s2, s4, s6]]""" + ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5], [s2, s4, s6]]`` + """ b, c, t = x.size() x_unsqz = x.view(b, num_sqz, c // num_sqz, t) @@ -47,7 +49,10 @@ def unsqueeze(x, x_mask=None, num_sqz=2): class Decoder(nn.Module): """Stack of Glow Decoder Modules. - Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze + + :: + + Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze Args: in_channels (int): channels of input tensor. @@ -106,6 +111,12 @@ class Decoder(nn.Module): ) def forward(self, x, x_mask, g=None, reverse=False): + """ + Shapes: + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1 ,T]` + - g: :math:`[B, C]` + """ if not reverse: flows = self.flows logdet_tot = 0 diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index 51d1066a..e35aeb68 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -6,13 +6,16 @@ from ..generic.normalization import LayerNorm class DurationPredictor(nn.Module): """Glow-TTS duration prediction model. 
- [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs - Args: - in_channels ([type]): [description] - hidden_channels ([type]): [description] - kernel_size ([type]): [description] - dropout_p ([type]): [description] + :: + + [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs + + Args: + in_channels (int): Number of channels of the input tensor. + hidden_channels (int): Number of hidden channels of the network. + kernel_size (int): Kernel size for the conv layers. + dropout_p (float): Dropout rate used after each conv layer. """ def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p): @@ -34,11 +37,8 @@ class DurationPredictor(nn.Module): def forward(self, x, x_mask): """ Shapes: - x: [B, C, T] - x_mask: [B, 1, T] - - Returns: - [type]: [description] + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ x = self.conv_1(x * x_mask) x = torch.relu(x) diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index 71aee94f..f3eb4655 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -15,13 +15,16 @@ from TTS.tts.utils.data import sequence_mask class Encoder(nn.Module): """Glow-TTS encoder module. - embedding -> -> encoder_module -> --> proj_mean - | - |-> proj_var - | - |-> concat -> duration_predictor - ↑ - speaker_embed + :: + + embedding -> -> encoder_module -> --> proj_mean + | + |-> proj_var + | + |-> concat -> duration_predictor + ↑ + speaker_embed + Args: num_chars (int): number of characters. out_channels (int): number of output channels. @@ -36,7 +39,8 @@ class Encoder(nn.Module): Shapes: - input: (B, T, C) - Notes: + :: + suggested encoder params... for encoder_type == 'rel_pos_transformer' @@ -139,9 +143,9 @@ class Encoder(nn.Module): def forward(self, x, x_lengths, g=None): """ Shapes: - x: [B, C, T] - x_lengths: [B] - g (optional): [B, 1, T] + - x: :math:`[B, C, T]` + - x_lengths: :math:`[B]` + - g (optional): :math:`[B, 1, T]` """ # embedding layer # [B ,T, D] diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 7620ef88..33036537 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -10,21 +10,24 @@ from ..generic.normalization import LayerNorm class ResidualConv1dLayerNormBlock(nn.Module): + """Conv1d with Layer Normalization and residual connection as in GlowTTS paper. + https://arxiv.org/pdf/1811.00002.pdf + + :: + + x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o + |---------------> conv1d_1x1 -----------------------| + + Args: + in_channels (int): number of input tensor channels. + hidden_channels (int): number of inner layer channels. + out_channels (int): number of output tensor channels. + kernel_size (int): kernel size of conv1d filter. + num_layers (int): number of blocks. + dropout_p (float): dropout rate for each block. + """ + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, num_layers, dropout_p): - """Conv1d with Layer Normalization and residual connection as in GlowTTS paper. - https://arxiv.org/pdf/1811.00002.pdf - - x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o - |---------------> conv1d_1x1 -----------------------| - - Args: - in_channels (int): number of input tensor channels. - hidden_channels (int): number of inner layer channels. - out_channels (int): number of output tensor channels. - kernel_size (int): kernel size of conv1d filter. - num_layers (int): number of blocks. 
- dropout_p (float): dropout rate for each block. - """ super().__init__() self.in_channels = in_channels self.hidden_channels = hidden_channels @@ -51,6 +54,11 @@ class ResidualConv1dLayerNormBlock(nn.Module): self.proj.bias.data.zero_() def forward(self, x, x_mask): + """ + Shapes: + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` + """ x_res = x for i in range(self.num_layers): x = self.conv_layers[i](x * x_mask) @@ -95,8 +103,8 @@ class InvConvNear(nn.Module): def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument """ Shapes: - x: B x C x T - x_mask: B x 1 x T + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ b, c, t = x.size() @@ -139,10 +147,12 @@ class CouplingBlock(nn.Module): """Glow Affine Coupling block as in GlowTTS paper. https://arxiv.org/pdf/1811.00002.pdf - x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o - '-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^ + :: - Args: + x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o + '-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^ + + Args: in_channels (int): number of input tensor channels. hidden_channels (int): number of hidden channels. kernel_size (int): WaveNet filter kernel size. @@ -152,8 +162,8 @@ class CouplingBlock(nn.Module): dropout_p (int): wavenet dropout rate. sigmoid_scale (bool): enable/disable sigmoid scaling for output scale. - Note: - It does not use conditional inputs differently from WaveGlow. + Note: + It does not use the conditional inputs differently from WaveGlow. """ def __init__( @@ -193,9 +203,9 @@ class CouplingBlock(nn.Module): def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument """ Shapes: - x: B x C x T - x_mask: B x 1 x T - g: B x C x 1 + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` + - g: :math:`[B, C, 1]` """ if x_mask is None: x_mask = 1 diff --git a/TTS/tts/layers/glow_tts/transformer.py b/TTS/tts/layers/glow_tts/transformer.py index 1a67d0ba..92cace78 100644 --- a/TTS/tts/layers/glow_tts/transformer.py +++ b/TTS/tts/layers/glow_tts/transformer.py @@ -17,16 +17,18 @@ class RelativePositionMultiHeadAttention(nn.Module): Note: Example with relative attention window size 2 - input = [a, b, c, d, e] - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)] + + - input = [a, b, c, d, e] + - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)] So it learns 4 embedding vectors (in total 8) separately for key and value vectors. Considering the input c - e(t-2) corresponds to c -> a - e(t-2) corresponds to c -> b - e(t-2) corresponds to c -> d - e(t-2) corresponds to c -> e + + - e(t-2) corresponds to c -> a + - e(t-2) corresponds to c -> b + - e(t-2) corresponds to c -> d + - e(t-2) corresponds to c -> e These embeddings are shared among different time steps. So input a, b, d and e also uses the same embeddings. @@ -106,6 +108,12 @@ class RelativePositionMultiHeadAttention(nn.Module): nn.init.xavier_uniform_(self.conv_v.weight) def forward(self, x, c, attn_mask=None): + """ + Shapes: + - x: :math:`[B, C, T]` + - c: :math:`[B, C, T]` + - attn_mask: :math:`[B, 1, T, T]` + """ q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) @@ -163,9 +171,9 @@ class RelativePositionMultiHeadAttention(nn.Module): re (Tensor): relative value embedding vector. 
(a_(i,j)^V) Shapes: - p_attn: [B, H, T, V] - re: [H or 1, V, D] - logits: [B, H, T, D] + -p_attn: :math:`[B, H, T, V]` + -re: :math:`[H or 1, V, D]` + -logits: :math:`[B, H, T, D]` """ logits = torch.matmul(p_attn, re.unsqueeze(0)) return logits @@ -178,9 +186,9 @@ class RelativePositionMultiHeadAttention(nn.Module): re (Tensor): relative key embedding vector. (a_(i,j)^K) Shapes: - query: [B, H, T, D] - re: [H or 1, V, D] - logits: [B, H, T, V] + - query: :math:`[B, H, T, D]` + - re: :math:`[H or 1, V, D]` + - logits: :math:`[B, H, T, V]` """ # logits = torch.einsum('bhld, kmd -> bhlm', [query, re.to(query.dtype)]) logits = torch.matmul(query, re.unsqueeze(0).transpose(-2, -1)) @@ -202,10 +210,10 @@ class RelativePositionMultiHeadAttention(nn.Module): @staticmethod def _relative_position_to_absolute_position(x): """Converts tensor from relative to absolute indexing for local attention. - Args: - x: [B, D, length, 2 * length - 1] + Shapes: + x: :math:`[B, C, T, 2 * T - 1]` Returns: - A Tensor of shape [B, D, length, length] + A Tensor of shape :math:`[B, C, T, T]` """ batch, heads, length, _ = x.size() # Pad to shift from relative to absolute indexing. @@ -220,8 +228,9 @@ class RelativePositionMultiHeadAttention(nn.Module): @staticmethod def _absolute_position_to_relative_position(x): """ - x: [B, H, T, T] - ret: [B, H, T, 2*T-1] + Shapes: + - x: :math:`[B, C, T, T]` + - ret: :math:`[B, C, T, 2*T-1]` """ batch, heads, length, _ = x.size() # padd along column @@ -239,7 +248,7 @@ class RelativePositionMultiHeadAttention(nn.Module): Args: length (int): an integer scalar. Returns: - a Tensor with shape [1, 1, length, length] + a Tensor with shape :math:`[1, 1, T, T]` """ # L r = torch.arange(length, dtype=torch.float32) @@ -362,8 +371,8 @@ class RelativePositionTransformer(nn.Module): def forward(self, x, x_mask): """ Shapes: - x: [B, C, T] - x_mask: [B, 1, T] + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) for i in range(self.num_layers): diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index ca2682dc..5f966c2c 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -30,24 +30,31 @@ class GlowTTS(BaseTTS): the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our model can be easily extended to a multi-speaker setting. - Check `GlowTTSConfig` for class arguments. + Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments. 
+ + Examples: + >>> from TTS.tts.configs import GlowTTSConfig + >>> from TTS.tts.models.glow_tts import GlowTTS + >>> config = GlowTTSConfig() + >>> model = GlowTTS(config) + """ def __init__(self, config: GlowTTSConfig): super().__init__() - chars, self.config = self.get_characters(config) - self.num_chars = len(chars) - self.decoder_output_dim = config.out_channels - self.init_multispeaker(config) - # pass all config fields to `self` # for fewer code change self.config = config for key in config: setattr(self, key, config[key]) + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + self.init_multispeaker(config) + # if is a multispeaker and c_in_channels is 0, set to 256 self.c_in_channels = 0 if self.num_speakers > 1: @@ -91,7 +98,7 @@ class GlowTTS(BaseTTS): @staticmethod def compute_outputs(attn, o_mean, o_log_scale, x_mask): - # compute final values with the computed alignment + """ Compute and format the mode outputs with the given alignment map""" y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose( 1, 2 ) # [b, t', t], [b, t, d] -> [b, d, t'] @@ -107,11 +114,11 @@ class GlowTTS(BaseTTS): ): # pylint: disable=dangerous-default-value """ Shapes: - x: [B, T] - x_lenghts: B - y: [B, T, C] - y_lengths: B - g: [B, C] or B + - x: :math:`[B, T]` + - x_lenghts::math:` B` + - y: :math:`[B, T, C]` + - y_lengths::math:` B` + - g: :math:`[B, C] or B` """ y = y.transpose(1, 2) y_max_length = y.size(2) @@ -161,12 +168,13 @@ class GlowTTS(BaseTTS): """ It's similar to the teacher forcing in Tacotron. It was proposed in: https://arxiv.org/abs/2104.05557 + Shapes: - x: [B, T] - x_lenghts: B - y: [B, T, C] - y_lengths: B - g: [B, C] or B + - x: :math:`[B, T]` + - x_lenghts: :math:`B` + - y: :math:`[B, T, C]` + - y_lengths: :math:`B` + - g: :math:`[B, C] or B` """ y = y.transpose(1, 2) y_max_length = y.size(2) @@ -221,9 +229,9 @@ class GlowTTS(BaseTTS): ): # pylint: disable=dangerous-default-value """ Shapes: - y: [B, T, C] - y_lengths: B - g: [B, C] or B + - y: :math:`[B, T, C]` + - y_lengths: :math:`B` + - g: :math:`[B, C] or B` """ y = y.transpose(1, 2) y_max_length = y.size(2) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 3706b4ec..27b52bef 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -54,7 +54,7 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method Tensor: spectrogram frames. Shapes: - x: [B x T] or [B x 1 x T] + x: [B x T] or [:math:`[B, 1, T]`] """ if x.ndim == 2: x = x.unsqueeze(1) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 94583147..39176155 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -22,6 +22,9 @@ class GAN(BaseVocoder): """Wrap a generator and a discriminator network. It provides a compatible interface for the trainer. It also helps mixing and matching different generator and disciminator networks easily. + To implement a new GAN models, you just need to define the generator and the discriminator networks, the rest + is handled by the `GAN` class. + Args: config (Coqpit): Model configuration. @@ -39,12 +42,41 @@ class GAN(BaseVocoder): self.y_hat_g = None # the last generator prediction to be passed onto the discriminator def forward(self, x: torch.Tensor) -> torch.Tensor: + """Run the generator's forward pass. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: output of the GAN generator network. 
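+
+        Example (an illustrative sketch; `gan` stands for an initialized `GAN` wrapping a mel-spectrogram
+            conditioned generator such as HiFi-GAN):
+
+            >>> import torch
+            >>> mel = torch.rand(1, 80, 64)   # [batch, num_mels, num_frames]
+            >>> waveform = gan.forward(mel)   # [batch, 1, num_samples]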
+ """ return self.model_g.forward(x) def inference(self, x: torch.Tensor) -> torch.Tensor: + """Run the generator's inference pass. + + Args: + x (torch.Tensor): Input tensor. + Returns: + torch.Tensor: output of the GAN generator network. + """ return self.model_g.inference(x) def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for + network on the current pass. + + Args: + batch (Dict): Batch of samples returned by the dataloader. + criterion (Dict): Criterion used to compute the losses. + optimizer_idx (int): ID of the optimizer in use on the current pass. + + Raises: + ValueError: `optimizer_idx` is an unexpected value. + + Returns: + Tuple[Dict, Dict]: model outputs and the computed loss values. + """ outputs = None loss_dict = None @@ -145,7 +177,18 @@ class GAN(BaseVocoder): return outputs, loss_dict @staticmethod - def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + """Logging shared by the training and evaluation. + + Args: + name (str): Name of the run. `train` or `eval`, + ap (AudioProcessor): Audio processor used in training. + batch (Dict): Batch used in the last train/eval step. + outputs (Dict): Model outputs from the last train/eval step. + + Returns: + Tuple[Dict, Dict]: log figures and audio samples. + """ y_hat = outputs[0]["model_outputs"] y = batch["waveform"] figures = plot_results(y_hat, y, ap, name) @@ -154,13 +197,16 @@ class GAN(BaseVocoder): return figures, audios def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """Call `_log()` for training.""" return self._log("train", ap, batch, outputs) @torch.no_grad() def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: + """Call `train_step()` with `no_grad()`""" return self.train_step(batch, criterion, optimizer_idx) def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """Call `_log()` for evaluation.""" return self._log("eval", ap, batch, outputs) def load_checkpoint( @@ -169,6 +215,13 @@ class GAN(BaseVocoder): checkpoint_path: str, eval: bool = False, # pylint: disable=unused-argument, redefined-builtin ) -> None: + """Load a GAN checkpoint and initialize model parameters. + + Args: + config (Coqpit): Model config. + checkpoint_path (str): Checkpoint file path. + eval (bool, optional): If true, load the model for inference. If falseDefaults to False. + """ state = torch.load(checkpoint_path, map_location=torch.device("cpu")) # band-aid for older than v0.0.15 GAN models if "model_disc" in state: @@ -181,9 +234,21 @@ class GAN(BaseVocoder): self.model_g.remove_weight_norm() def on_train_step_start(self, trainer) -> None: + """Enable the discriminator training based on `steps_to_start_discriminator` + + Args: + trainer (Trainer): Trainer object. + """ self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator - def get_optimizer(self): + def get_optimizer(self) -> List: + """Initiate and return the GAN optimizers based on the config parameters. + + It returnes 2 optimizers in a list. First one is for the generator and the second one is for the discriminator. + + Returns: + List: optimizers. 
+ """ optimizer1 = get_optimizer( self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, self.model_g ) @@ -192,16 +257,37 @@ class GAN(BaseVocoder): ) return [optimizer1, optimizer2] - def get_lr(self): + def get_lr(self) -> List: + """Set the initial learning rates for each optimizer. + + Returns: + List: learning rates for each optimizer. + """ return [self.config.lr_gen, self.config.lr_disc] - def get_scheduler(self, optimizer): + def get_scheduler(self, optimizer) -> List: + """Set the schedulers for each optimizer. + + Args: + optimizer (List[`torch.optim.Optimizer`]): List of optimizers. + + Returns: + List: Schedulers, one for each optimizer. + """ scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0]) scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1]) return [scheduler1, scheduler2] @staticmethod - def format_batch(batch): + def format_batch(batch: List) -> Dict: + """Format the batch for training. + + Args: + batch (List): Batch out of the dataloader. + + Returns: + Dict: formatted model inputs. + """ if isinstance(batch[0], list): x_G, y_G = batch[0] x_D, y_D = batch[1] @@ -218,6 +304,19 @@ class GAN(BaseVocoder): verbose: bool, num_gpus: int, ): + """Initiate and return the GAN dataloader. + + Args: + config (Coqpit): Model config. + ap (AudioProcessor): Audio processor. + is_eval (True): Set the dataloader for evaluation if true. + data_items (List): Data samples. + verbose (bool): Log information if true. + num_gpus (int): Number of GPUs in use. + + Returns: + DataLoader: Torch dataloader. + """ dataset = GANDataset( ap=ap, items=data_items, diff --git a/TTS/vocoder/tf/layers/pqmf.py b/TTS/vocoder/tf/layers/pqmf.py index 81b666b9..042f2f08 100644 --- a/TTS/vocoder/tf/layers/pqmf.py +++ b/TTS/vocoder/tf/layers/pqmf.py @@ -34,7 +34,7 @@ class PQMF(tf.keras.layers.Layer): def analysis(self, x): """ - x : B x 1 x T + x : :math:`[B, 1, T]` """ x = tf.transpose(x, perm=[0, 2, 1]) x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) diff --git a/TTS/vocoder/tf/models/melgan_generator.py b/TTS/vocoder/tf/models/melgan_generator.py index 205a240e..09ee9530 100644 --- a/TTS/vocoder/tf/models/melgan_generator.py +++ b/TTS/vocoder/tf/models/melgan_generator.py @@ -92,7 +92,7 @@ class MelganGenerator(tf.keras.models.Model): @tf.function(experimental_relax_shapes=True) def call(self, c, training=False): """ - c : B x C x T + c : :math:`[B, C, T]` """ if training: raise NotImplementedError() diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 43d0d884..fe706ba9 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -113,7 +113,7 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): """ Sample from discretized mixture of logistic distributions Args: - y (Tensor): B x C x T + y (Tensor): :math:`[B, C, T]` log_scale_min (float): Log scale minimum value Returns: Tensor: sample in range of [-1, 1]. diff --git a/docs/source/audio_processor.md b/docs/source/audio_processor.md deleted file mode 100644 index 1a7bf8ae..00000000 --- a/docs/source/audio_processor.md +++ /dev/null @@ -1,25 +0,0 @@ -# AudioProcessor - -`TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for - -- Feature extraction. -- Sound normalization. -- Reading and writing audio files. -- Sampling audio signals. 
-- Normalizing and denormalizing audio signals. -- Griffin-Lim vocoder. - -The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config -also must inherit or initiate `BaseAudioConfig`. - -## AudioProcessor -```{eval-rst} -.. autoclass:: TTS.utils.audio.AudioProcessor - :members: -``` - -## BaseAudioConfig -```{eval-rst} -.. autoclass:: TTS.config.shared_configs.BaseAudioConfig - :members: -``` \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 87c91d96..d11f4bef 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,6 +50,43 @@ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'TODO/*'] source_suffix = [".rst", ".md"] +# extensions +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.autosectionlabel', + 'myst_parser', + "sphinx_copybutton", + "sphinx_inline_tabs", +] + +# 'sphinxcontrib.katex', +# 'sphinx.ext.autosectionlabel', + + +# autosectionlabel throws warnings if section names are duplicated. +# The following tells autosectionlabel to not throw a warning for +# duplicated section names that are in different documents. +autosectionlabel_prefix_document = True + +language = None + +autodoc_inherit_docstrings = False + +# Disable displaying type annotations, these can be very verbose +autodoc_typehints = 'none' + +# Enable overriding of function signatures in the first line of the docstring. +autodoc_docstring_signature = True + +napoleon_custom_sections = [('Shapes', 'shape')] + # -- Options for HTML output ------------------------------------------------- @@ -80,23 +117,3 @@ html_sidebars = { # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] - - -# using markdown -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.coverage', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', - 'sphinx.ext.autosectionlabel', - 'myst_parser', - "sphinx_copybutton", - "sphinx_inline_tabs", -] - -# 'sphinxcontrib.katex', -# 'sphinx.ext.autosectionlabel', diff --git a/docs/source/converting_torch_to_tf.md b/docs/source/converting_torch_to_tf.md index 6b992eb0..20a0be6b 100644 --- a/docs/source/converting_torch_to_tf.md +++ b/docs/source/converting_torch_to_tf.md @@ -1,4 +1,4 @@ -# Converting Torch Tacotron to TF 2 +# Converting Torch to TF 2 Currently, 🐸TTS supports the vanilla Tacotron2 and MelGAN models in TF 2.It does not support advanced attention methods and other small tricks used by the Torch models. You can convert any Torch model trained after v0.0.2. diff --git a/docs/source/dataset.md b/docs/source/dataset.md deleted file mode 100644 index 92d381ac..00000000 --- a/docs/source/dataset.md +++ /dev/null @@ -1,25 +0,0 @@ -# Datasets - -## TTS Dataset - -```{eval-rst} -.. autoclass:: TTS.tts.datasets.TTSDataset - :members: -``` - -## Vocoder Dataset - -```{eval-rst} -.. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset - :members: -``` - -```{eval-rst} -.. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset - :members: -``` - -```{eval-rst} -.. 
autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset - :members: -``` \ No newline at end of file diff --git a/docs/source/faq.md b/docs/source/faq.md index 6f5de6d8..4dbaab13 100644 --- a/docs/source/faq.md +++ b/docs/source/faq.md @@ -105,7 +105,7 @@ The best approach is to pick a set of promising models and run a Mean-Opinion-Sc - Check the 4th step under "How can I check model performance?" ## How can I test a trained model? -- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. - If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. ## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md index 5a9aaae7..c0043bf1 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/implementing_a_new_model.md @@ -36,7 +36,7 @@ There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you the infinite flexibility to add custom behaviours for your model and training routines. - For more details, see {ref}`BaseTTS ` and `TTS/utils/callbacks.py`. + For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. 6. Optionally, define `MyModelArgs`. diff --git a/docs/source/index.md b/docs/source/index.md index 82792fee..ec79ecfd 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -2,7 +2,6 @@ ```{include} ../../README.md :relative-images: ``` - ---- # Documentation Content @@ -27,14 +26,28 @@ formatting_your_dataset what_makes_a_good_dataset tts_datasets + converting_torch_to_tf .. toctree:: :maxdepth: 2 :caption: Main Classes - trainer_api - audio_processor - model_api - configuration - dataset -``` \ No newline at end of file + main_classes/trainer_api + main_classes/audio_processor + main_classes/model_api + main_classes/dataset + main_classes/gan + +.. toctree:: + :maxdepth: 2 + :caption: `tts` Models + + models/glow_tts.md + +.. toctree:: + :maxdepth: 2 + :caption: `vocoder` Models + + main_classes/gan +``` + diff --git a/docs/source/main_classes/audio_processor.md b/docs/source/main_classes/audio_processor.md index 1a7bf8ae..600b0db5 100644 --- a/docs/source/main_classes/audio_processor.md +++ b/docs/source/main_classes/audio_processor.md @@ -1,4 +1,4 @@ -# AudioProcessor +# AudioProcessor API `TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 438901b7..6781a268 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -19,6 +19,6 @@ Model API provides you a set of functions that easily make your model compatible ## Base `vocoder` Model ```{eval-rst} -.. autoclass:: TTS.tts.models.base_vocoder.BaseVocoder` +.. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder :members: ``` \ No newline at end of file diff --git a/docs/source/model_api.md b/docs/source/model_api.md deleted file mode 100644 index 438901b7..00000000 --- a/docs/source/model_api.md +++ /dev/null @@ -1,24 +0,0 @@ -# Model API -Model API provides you a set of functions that easily make your model compatible with the `Trainer`, -`Synthesizer` and `ModelZoo`. - -## Base TTS Model - -```{eval-rst} -.. 
autoclass:: TTS.model.BaseModel - :members: -``` - -## Base `tts` Model - -```{eval-rst} -.. autoclass:: TTS.tts.models.base_tts.BaseTTS - :members: -``` - -## Base `vocoder` Model - -```{eval-rst} -.. autoclass:: TTS.tts.models.base_vocoder.BaseVocoder` - :members: -``` \ No newline at end of file diff --git a/docs/source/trainer_api.md b/docs/source/trainer_api.md deleted file mode 100644 index a5c3cfb7..00000000 --- a/docs/source/trainer_api.md +++ /dev/null @@ -1,17 +0,0 @@ -# Trainer API - -The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but -can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. - - -## Trainer -```{eval-rst} -.. autoclass:: TTS.trainer.Trainer - :members: -``` - -## TrainingArgs -```{eval-rst} -.. autoclass:: TTS.trainer.TrainingArgs - :members: -``` \ No newline at end of file From 8cb16da82ab2862b7e696ca341c342640980f9b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 30 Jun 2021 15:00:14 +0200 Subject: [PATCH 238/258] Fix readthedocs build --- docs/source/readthedocs.yml => .readthedocs.yml | 7 ++++--- docs/Makefile | 2 +- docs/source/conf.py | 6 ++++-- docs/source/index.md | 1 - 4 files changed, 9 insertions(+), 7 deletions(-) rename docs/source/readthedocs.yml => .readthedocs.yml (72%) diff --git a/docs/source/readthedocs.yml b/.readthedocs.yml similarity index 72% rename from docs/source/readthedocs.yml rename to .readthedocs.yml index 59eed1f7..946d363c 100644 --- a/docs/source/readthedocs.yml +++ b/.readthedocs.yml @@ -8,10 +8,11 @@ version: 2 # Build documentation in the docs/ directory with Sphinx sphinx: builder: html - configuration: docs/conf.py + configuration: docs/source/conf.py # Optionally set the version of Python and requirements required to build your docs python: - version: 3.8 + version: 3.7 install: - - requirements: doc/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt + - requirements: requirements.txt \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index 92dd33a1..b1d20a99 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -3,7 +3,7 @@ # You can set these variables from the command line, and also # from the environment for the first two. -SPHINXOPTS ?= +SPHINXOPTS ?= -j auto -WT --keep-going SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = _build diff --git a/docs/source/conf.py b/docs/source/conf.py index d11f4bef..5831fcdb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,8 +13,10 @@ import os import sys -sys.path.insert(0, os.path.abspath('../../TTS')) -autodoc_mock_imports = ["tts"] +sys.path.insert(0, os.path.abspath('../..')) + +# mock deps with system level requirements. 
+autodoc_mock_imports = ["soundfile"] # -- Project information ----------------------------------------------------- project = 'TTS' diff --git a/docs/source/index.md b/docs/source/index.md index ec79ecfd..ec32c303 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -48,6 +48,5 @@ :maxdepth: 2 :caption: `vocoder` Models - main_classes/gan ``` From db47f4f105457164d00b7a2507daca5aca027deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:43:00 +0200 Subject: [PATCH 239/258] Update `.models.json` --- TTS/.models.json | 47 +++++++++++++++++------------------------------ 1 file changed, 17 insertions(+), 30 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 310dc5f0..b8d8d4f7 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,10 +4,9 @@ "ek1":{ "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", - "commit": "c802255", - "needs_phonemizer": true + "commit": "c802255" } }, "ljspeech":{ @@ -18,8 +17,7 @@ "commit": "bae2ad0f", "author": "Eren Gölge @erogol", "license": "", - "contact":"egolge@coqui.com", - "needs_phonemizer": false + "contact":"egolge@coqui.com" }, "glow-tts":{ "description": "", @@ -29,8 +27,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" }, "tacotron2-DCA": { "description": "", @@ -39,30 +36,27 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" }, "speedy-speech-wn":{ "description": "Speedy Speech model with wavenet decoder.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--en--ljspeech--speedy-speech-wn.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "77b6145", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } }, "vctk":{ "sc-glow-tts": { "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/tts_models--en--vctk--sc-glowtts-transformer.zip", - "default_vocoder": null, + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glowtts-transformer.zip", + "default_vocoder": "vocoder_models/en/vctk/hifigan_v2", "commit": "b531fa69", "author": "Edresson Casanova", "license": "", - "contact":"", - "needs_phonemizer": true + "contact":"" } @@ -75,8 +69,7 @@ "commit": "bae2ad0f", "author": "Eren Gölge @erogol", "license": "", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } } }, @@ -88,8 +81,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } } }, @@ -101,8 +93,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + 
"contact":"egolge@coqui.com" } } }, @@ -122,8 +113,7 @@ "author": "@r-dh", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, - "commit": "540d811", - "needs_phonemizer": true + "commit": "540d811" } } }, @@ -134,8 +124,7 @@ "author": "@erogol", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "license":"", - "contact": "egolge@coqui.com", - "needs_phonemizer": true + "contact": "egolge@coqui.com" } } }, @@ -145,8 +134,7 @@ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/wavegrad", "author": "@thorstenMueller", - "commit": "unknown", - "needs_phonemizer": true + "commit": "unknown" } } }, @@ -157,8 +145,7 @@ "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", - "commit": "401fbd89", - "needs_phonemizer": false + "commit": "401fbd89" } } } From a4c658f5ef64a35f0d725344594cda9cc2e04597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:43:38 +0200 Subject: [PATCH 240/258] Fix for using the `Synthesizer` out of the model --- TTS/bin/synthesize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 3cde5612..9895c04e 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -239,7 +239,7 @@ def main(): print( " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.speaker_manager.speaker_ids) + print(synthesizer.tts_model.speaker_manager.speaker_ids) return # check the arguments against a multi-speaker model. From 0fa6a8c9b8c3e4174a53b95d033620b7f1c54918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:44:23 +0200 Subject: [PATCH 241/258] Fix glow tts default parameters --- TTS/tts/configs/glow_tts_config.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index cfa7cde2..caf2f71b 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -41,7 +41,7 @@ class GlowTTSConfig(BaseTTSConfig): kernel_size_dec (int): Decoder kernel size. Defaults to 5 dilation_rate (int): - Rate to increase dilation by each layer in a decoder block. Defaults to 5. + Rate to increase dilation by each layer in a decoder block. Defaults to 1. num_block_layers (int): Number of decoder layers in each decoder block. Defaults to 4. dropout_p_dec (float): @@ -54,7 +54,7 @@ class GlowTTSConfig(BaseTTSConfig): Number of split levels in inversible conv1x1 operation. Defaults to 4. num_squeeze (int): Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor - 'num_squeeze'. Defaults to 1. + 'num_squeeze'. Defaults to 2. sigmoid_scale (bool): enable/disable sigmoid scaling in decoder. Defaults to False. mean_only (bool): @@ -74,6 +74,8 @@ class GlowTTSConfig(BaseTTSConfig): Path to the wav file used for changing the style of the speech. Defaults to None. inference_noise_scale (float): Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0. + length_scale (float): + Multiply the predicted durations with this value to change the speech speed. Defaults to 1. 
use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. @@ -120,14 +122,13 @@ class GlowTTSConfig(BaseTTSConfig): num_flow_blocks_dec: int = 12 inference_noise_scale: float = 0.33 kernel_size_dec: int = 5 - dilation_rate: int = 5 + dilation_rate: int = 1 num_block_layers: int = 4 num_speakers: int = 0 c_in_channels: int = 0 num_splits: int = 4 - num_squeeze: int = 1 + num_squeeze: int = 2 sigmoid_scale: bool = False - mean_only: bool = False encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -147,6 +148,7 @@ class GlowTTSConfig(BaseTTSConfig): # inference params style_wav_for_test: str = None inference_noise_scale: float = 0.0 + length_scale: float = 1.0 # multi-speaker settings use_speaker_embedding: bool = False From 40b0b5365e319c4a0af420d5be4146e8be312f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:45:00 +0200 Subject: [PATCH 242/258] Let `get_characters` return `num_chars` --- TTS/tts/models/base_tts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 015d0200..2ec268d6 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -41,7 +41,8 @@ class BaseTTS(BaseModel): config.characters = parse_symbols() model_characters = phonemes if config.use_phonemes else symbols - return model_characters, config + num_chars = len(model_characters) + getattr(config, "add_blank", False) + return model_characters, config, num_chars def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: return get_speaker_manager(config, restore_path, data, out_path) From 95ad72f38feb7d839c5bfe51409a713a78565edf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:45:37 +0200 Subject: [PATCH 243/258] Fix glow tts initialization --- TTS/tts/models/glow_tts.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 5f966c2c..d7406c73 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -11,6 +11,7 @@ from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor @@ -50,9 +51,9 @@ class GlowTTS(BaseTTS): for key in config: setattr(self, key, config[key]) - chars, self.config = self.get_characters(config) - self.num_chars = len(chars) + chars, self.config, self.num_chars = self.get_characters(config) self.decoder_output_dim = config.out_channels + self.init_multispeaker(config) # if is a multispeaker and c_in_channels is 0, set to 256 @@ -91,9 +92,23 @@ class GlowTTS(BaseTTS): c_in_channels=self.c_in_channels, ) - if self.num_speakers > 1 and not self.d_vector_dim: - # speaker embedding layer - self.emb_g = nn.Embedding(self.num_speakers, self.c_in_channels) + def init_multispeaker(self, config: "Coqpit", data: list = None) -> None: + """Initialize multi-speaker modules of a model. 
A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + If you need a different behaviour, override this function for your model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + # init speaker manager + self.speaker_manager = get_speaker_manager(config, data=data) + self.num_speakers = self.speaker_manager.num_speakers + # init speaker embedding layer + if config.use_speaker_embedding and not config.use_d_vector_file: + self.embedded_speaker_dim = self.c_in_channels + self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @staticmethod @@ -260,6 +275,7 @@ class GlowTTS(BaseTTS): def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None}): # pylint: disable=dangerous-default-value x_lengths = aux_input["x_lengths"] g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None + if g is not None: if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) From 9352cb413673918d3704944c6bf584dac2a58f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:45:58 +0200 Subject: [PATCH 244/258] Format Align TTS docstrings --- TTS/tts/models/align_tts.py | 50 ++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index dbd57b83..3d52e5e2 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -73,13 +73,7 @@ class AlignTTS(BaseTTS): Encoder -> DurationPredictor -> Decoder - Check ```AlignTTSArgs``` for the class arguments. - - Examples: - >>> from TTS.tts.configs import AlignTTSConfig - >>> config = AlignTTSConfig() - >>> config.model_args.num_chars = 50 - >>> model = AlignTTS(config) + Check :class:`AlignTTSArgs` for the class arguments. Paper Abstract: Targeting at both high efficiency and performance, we propose AlignTTS to predict the @@ -99,6 +93,11 @@ class AlignTTS(BaseTTS): Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. 
+ Examples: + >>> from TTS.tts.configs import AlignTTSConfig + >>> config = AlignTTSConfig() + >>> model = AlignTTS(config) + """ # pylint: disable=dangerous-default-value @@ -113,6 +112,11 @@ class AlignTTS(BaseTTS): if isinstance(config.model_args.length_scale, int) else config.model_args.length_scale ) + + if not self.config.model_args.num_chars: + chars, self.config, num_chars = self.get_characters(config) + self.config.model_args.num_chars = num_chars + self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) self.embedded_speaker_dim = 0 @@ -173,15 +177,15 @@ class AlignTTS(BaseTTS): """Generate attention alignment map from durations and expand encoder outputs - Example: - encoder output: [a,b,c,d] - durations: [1, 3, 2, 1] + Examples:: + - encoder output: [a,b,c,d] + - durations: [1, 3, 2, 1] - expanded: [a, b, b, b, c, c, d] - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] + - expanded: [a, b, b, b, c, c, d] + - attention map: [[0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) @@ -257,11 +261,11 @@ class AlignTTS(BaseTTS): ): # pylint: disable=unused-argument """ Shapes: - x: [B, T_max] - x_lengths: [B] - y_lengths: [B] - dr: [B, T_max] - g: [B, C] + - x: :math:`[B, T_max]` + - x_lengths: :math:`[B]` + - y_lengths: :math:`[B]` + - dr: :math:`[B, T_max]` + - g: :math:`[B, C]` """ y = y.transpose(1, 2) g = aux_input["d_vectors"] if "d_vectors" in aux_input else None @@ -311,9 +315,9 @@ class AlignTTS(BaseTTS): def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: - x: [B, T_max] - x_lengths: [B] - g: [B, C] + - x: :math:`[B, T_max]` + - x_lengths: :math:`[B]` + - g: :math:`[B, C]` """ g = aux_input["d_vectors"] if "d_vectors" in aux_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) From 196876feb12f36d52a523c3fb177a0f9edf69793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:47:05 +0200 Subject: [PATCH 245/258] Fix `ModelManager` model download --- TTS/tts/models/speedy_speech.py | 3 +-- TTS/utils/manage.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 2eb70a6b..f12ee8f6 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -98,8 +98,7 @@ class SpeedySpeech(BaseTTS): self.config = config if "characters" in config: - chars, self.config = self.get_characters(config) - self.num_chars = len(chars) + chars, self.config, self.num_chars = self.get_characters(config) self.length_scale = ( float(config.model_args.length_scale) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 93497517..d5e8d410 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,7 +3,7 @@ import json import os import zipfile from pathlib import Path -from shutil import copyfile +from shutil import copyfile, rmtree import gdown import requests @@ -83,7 +83,7 @@ class ModelManager(object): 'type/language/dataset/model' e.g. 'tts_model/en/ljspeech/tacotron' - Every model must have the following files + Every model must have the following files: - *.pth.tar : pytorch model checkpoint file. - config.json : model config file. 
- scale_stats.npy (if exist): scale values for preprocessing. @@ -101,11 +101,7 @@ class ModelManager(object): output_path = os.path.join(self.output_prefix, model_full_name) output_model_path = os.path.join(output_path, "model_file.pth.tar") output_config_path = os.path.join(output_path, "config.json") - # NOTE : band-aid for removing phoneme support - # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: - # raise RuntimeError( - # " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." - # ) + if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: @@ -116,7 +112,6 @@ class ModelManager(object): # download files to the output path if self._check_dict_key(model_item, "github_rls_url"): # download from github release - # TODO: pass output_path self._download_zip_file(model_item["github_rls_url"], output_path) else: # download from gdrive @@ -146,15 +141,20 @@ class ModelManager(object): gdown.download(f"{self.url_prefix}{gdrive_idx}", output=output, quiet=False) @staticmethod - def _download_zip_file(file_url, output): + def _download_zip_file(file_url, output_folder): """Download the github releases""" + # download the file r = requests.get(file_url) + # extract the file with zipfile.ZipFile(io.BytesIO(r.content)) as z: - z.extractall(output) + z.extractall(output_folder) + # move the files to the outer path for file_path in z.namelist()[1:]: - src_path = os.path.join(output, file_path) - dst_path = os.path.join(output, os.path.basename(file_path)) + src_path = os.path.join(output_folder, file_path) + dst_path = os.path.join(output_folder, os.path.basename(file_path)) copyfile(src_path, dst_path) + # remove the extracted folder + rmtree(os.path.join(output_folder, z.namelist()[0])) @staticmethod def _check_dict_key(my_dict, key): From 168f97cbe932c4c5461c67854b27f19c7c5b1962 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:47:55 +0200 Subject: [PATCH 246/258] Let `Synthesizer` use the speaker manager out of the model --- TTS/utils/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 365ab8bd..56a8c9b2 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -195,7 +195,7 @@ class Synthesizer(object): if self.tts_speakers_file: # get the speaker embedding from the saved d_vectors. if speaker_idx and isinstance(speaker_idx, str): - speaker_embedding = self.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " From 9e7824fe35a38b273d19b294b465688e4ae28b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 2 Jul 2021 10:48:34 +0200 Subject: [PATCH 247/258] Fix UnivNet inference code --- TTS/vocoder/models/hifigan_generator.py | 4 ++-- TTS/vocoder/models/univnet_generator.py | 24 ++++++++---------------- recipes/ljspeech/univnet/train.py | 1 + 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8d595a63..f606c649 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -15,7 +15,7 @@ def get_padding(k, d): class ResBlock1(torch.nn.Module): """Residual Block Type 1. 
It has 3 convolutional layers in each convolutiona block. - Network: + Network:: x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o |--------------------------------------------------------------------------------------------------| @@ -105,7 +105,7 @@ class ResBlock1(torch.nn.Module): class ResBlock2(torch.nn.Module): """Residual Block Type 1. It has 3 convolutional layers in each convolutiona block. - Network: + Network:: x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o |---------------------------------------------------| diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 4604abb2..0a6bd4c8 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -122,24 +122,16 @@ class UnivnetGenerator(torch.nn.Module): """Return receptive field size.""" return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) - def inference(self, c=None, x=None): + @torch.no_grad() + def inference(self, c): """Perform inference. Args: - c (Union[Tensor, ndarray]): Local conditioning auxiliary features (T' ,C). - x (Union[Tensor, ndarray]): Input noise signal (T, 1). + c (Tensor): Local conditioning auxiliary features :math:`(B, C, T)`. Returns: Tensor: Output tensor (T, out_channels) """ - if x is not None: - if not isinstance(x, torch.Tensor): - x = torch.tensor(x, dtype=torch.float).to(next(self.parameters()).device) - x = x.transpose(1, 0).unsqueeze(0) - else: - assert c is not None - x = torch.randn(1, 1, len(c) * self.upsample_factor).to(next(self.parameters()).device) - if c is not None: - if not isinstance(c, torch.Tensor): - c = torch.tensor(c, dtype=torch.float).to(next(self.parameters()).device) - c = c.transpose(1, 0).unsqueeze(0) - c = torch.nn.ReplicationPad1d(self.aux_context_window)(c) - return self.forward(c).squeeze(0).transpose(1, 0) + x = torch.randn([c.shape[0], self.in_channels, c.shape[2]]) + x = x.to(self.first_conv.bias.device) + + c = c.to(next(self.parameters())) + return self.forward(c) diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index d8f33ae3..a442b451 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -1,5 +1,6 @@ import os +from TTS.config.shared_configs import BaseAudioConfig from TTS.trainer import Trainer, TrainingArgs, init_training from TTS.vocoder.configs import UnivnetConfig From f382e4c700729645f15b255412a61e20d7a06503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 3 Jul 2021 13:30:24 +0200 Subject: [PATCH 248/258] Fix linter warnings --- Makefile | 4 ---- TTS/tts/models/align_tts.py | 2 +- TTS/tts/models/glow_tts.py | 2 +- TTS/tts/models/speedy_speech.py | 2 +- 4 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 5606e4d5..c7815f19 100644 --- a/Makefile +++ b/Makefile @@ -6,10 +6,6 @@ help: target_dirs := tests TTS notebooks -test_all: ## run tests and don't stop on an error. - nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id - ./run_bash_tests.sh - test_all: ## run tests and don't stop on an error. 
nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id ./run_bash_tests.sh diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 3d52e5e2..879ecae4 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -114,7 +114,7 @@ class AlignTTS(BaseTTS): ) if not self.config.model_args.num_chars: - chars, self.config, num_chars = self.get_characters(config) + _, self.config, num_chars = self.get_characters(config) self.config.model_args.num_chars = num_chars self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index d7406c73..9f235fad 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -51,7 +51,7 @@ class GlowTTS(BaseTTS): for key in config: setattr(self, key, config[key]) - chars, self.config, self.num_chars = self.get_characters(config) + _, self.config, self.num_chars = self.get_characters(config) self.decoder_output_dim = config.out_channels self.init_multispeaker(config) diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index f12ee8f6..8f14d610 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -98,7 +98,7 @@ class SpeedySpeech(BaseTTS): self.config = config if "characters" in config: - chars, self.config, self.num_chars = self.get_characters(config) + _, self.config, self.num_chars = self.get_characters(config) self.length_scale = ( float(config.model_args.length_scale) From c25a2184e72b0bc28f4ff8f8886cc01d6f7f5c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 3 Jul 2021 13:55:27 +0200 Subject: [PATCH 249/258] Add docs for `SpeakerManager` --- TTS/tts/utils/speakers.py | 55 +++++++++++---------- docs/source/index.md | 1 + docs/source/main_classes/speaker_manager.md | 11 +++++ 3 files changed, 41 insertions(+), 26 deletions(-) create mode 100644 docs/source/main_classes/speaker_manager.md diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 5caa2fee..8febcbbf 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -13,33 +13,27 @@ from TTS.utils.audio import AudioProcessor class SpeakerManager: - """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information - in a way that you can query. There are 3 different scenarios considered. + """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information + in a way that can be queried by speaker or clip. - 1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids. - 2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the following - format. + There are 3 different scenarios considered: - ``` - { - 'clip_name.wav':{ - 'name': 'speakerA', - 'embedding'[] - }, - ... - } - ``` + 1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer. + 2. Models using d-vectors. The datafile includes a dictionary in the following format. - 3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and - computes x vectors for a given instance. 
+ :: - >>> >>> # load audio processor and speaker encoder - >>> ap = AudioProcessor(**config.audio) - >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) - >>> # load a sample audio and compute embedding - >>> waveform = ap.load_wav(sample_wav_path) - >>> mel = ap.melspectrogram(waveform) - >>> d_vector = manager.compute_d_vector(mel.T) + { + 'clip_name.wav':{ + 'name': 'speakerA', + 'embedding'[] + }, + ... + } + + + 3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and + computes the d-vectors for a given clip or speaker. Args: d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". @@ -47,6 +41,15 @@ class SpeakerManager: TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "". + + Examples: + >>> # load audio processor and speaker encoder + >>> ap = AudioProcessor(**config.audio) + >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) + >>> # load a sample audio and compute embedding + >>> waveform = ap.load_wav(sample_wav_path) + >>> mel = ap.melspectrogram(waveform) + >>> d_vector = manager.compute_d_vector(mel.T) """ def __init__( @@ -188,7 +191,7 @@ class SpeakerManager: Args: speaker_idx (str): Target speaker ID. num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False. + randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False. Returns: np.ndarray: Mean d_vector. @@ -311,7 +314,7 @@ def save_speaker_mapping(out_path, speaker_mapping): def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: - """Create a SpeakerManager instance based on provided configuration. + """Initiate a `SpeakerManager` instance by the provided config. Args: c (Coqpit): Model configuration. @@ -321,7 +324,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None. Returns: - SpeakerManager: + SpeakerManager: initialized and ready to use instance. """ speaker_manager = SpeakerManager() if c.use_speaker_embedding: diff --git a/docs/source/index.md b/docs/source/index.md index ec32c303..001265fa 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -37,6 +37,7 @@ main_classes/model_api main_classes/dataset main_classes/gan + main_classes/speaker_manager .. toctree:: :maxdepth: 2 diff --git a/docs/source/main_classes/speaker_manager.md b/docs/source/main_classes/speaker_manager.md new file mode 100644 index 00000000..ba4b55dc --- /dev/null +++ b/docs/source/main_classes/speaker_manager.md @@ -0,0 +1,11 @@ +# Speaker Manager API + +The {class}`TTS.tts.utils.speakers.SpeakerManager` organize speaker related data and information for 🐸TTS models. It is +especially useful for multi-speaker models. + + +## Speaker Manager +```{eval-rst} +.. 
automodule:: TTS.tts.utils.speakers + :members: +``` \ No newline at end of file From d00c4c517a2ba911de60f159c2c28b08308a776a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 4 Jul 2021 11:09:40 +0200 Subject: [PATCH 250/258] Fix #607 --- MANIFEST.in | 1 + Makefile | 3 +++ 2 files changed, 4 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 861cb5a7..0d8b4b4c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include requirements.txt include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html diff --git a/Makefile b/Makefile index c7815f19..d3d7dd41 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,9 @@ dev-deps: ## install development deps doc-deps: ## install docs dependencies pip install -r docs/requirements.txt +build-docs: ## build the docs + cd docs && make clean && make build + hub-deps: ## install deps for torch hub use pip install -r requirements.hub.txt From 270c3823ebd2c0602a7dcfe92c4886b1afe62e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 4 Jul 2021 11:19:31 +0200 Subject: [PATCH 251/258] Fix #608 --- TTS/.models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/.models.json b/TTS/.models.json index b8d8d4f7..73204db6 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -51,7 +51,7 @@ "vctk":{ "sc-glow-tts": { "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glowtts-transformer.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip", "default_vocoder": "vocoder_models/en/vctk/hifigan_v2", "commit": "b531fa69", "author": "Edresson Casanova", From a05b234080bbf87db1499d63778e81564bd55319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 4 Jul 2021 11:25:49 +0200 Subject: [PATCH 252/258] Raise an error when multiple GPUs are in use User must define the target GPU by `CUDA_VISIBLE_DEVICES` and use `distribute.py` for multi-gpu training. --- TTS/utils/trainer_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TTS/utils/trainer_utils.py b/TTS/utils/trainer_utils.py index 02e68905..29915527 100644 --- a/TTS/utils/trainer_utils.py +++ b/TTS/utils/trainer_utils.py @@ -11,11 +11,15 @@ def is_apex_available(): def setup_torch_training_env(cudnn_enable, cudnn_benchmark): + num_gpus = torch.cuda.device_count() + if num_gpus > 1: + raise RuntimeError( + f" [!] {num_gpus} active GPUs. Define the target GPU by `CUDA_VISIBLE_DEVICES`. For multi-gpu training use `TTS/bin/distribute.py`." 
+ ) torch.backends.cudnn.enabled = cudnn_enable torch.backends.cudnn.benchmark = cudnn_benchmark torch.manual_seed(54321) use_cuda = torch.cuda.is_available() - num_gpus = torch.cuda.device_count() print(" > Using CUDA: ", use_cuda) print(" > Number of GPUs: ", num_gpus) return use_cuda, num_gpus From 1e9538aaef41cdca831b15240913f7e3387c88f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 4 Jul 2021 11:45:49 +0200 Subject: [PATCH 253/258] Add more model tests to `test_synthesize` --- tests/inference_tests/test_synthesize.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 62eb6dbe..526f7dc8 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -10,19 +10,19 @@ def test_synthesize(): # single speaker model run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - # run_cli( - # "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " - # f'--text "This is an example." --out_path "{output_path}"' - # ) - # run_cli( - # "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " - # "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - # f'--text "This is an example." --out_path "{output_path}"' - # ) + run_cli( + "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " + f'--text "This is an example." --out_path "{output_path}"' + ) + run_cli( + "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " + "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " + f'--text "This is an example." --out_path "{output_path}"' + ) - # # multi-speaker model - # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - # run_cli( - # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - # f'--text "This is an example." --out_path "{output_path}"' - # ) + # multi-speaker model + run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") + run_cli( + f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' + f'--text "This is an example." 
--out_path "{output_path}"' + ) From 0c347624e7e28897db2720d2db68185cc28f82a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sun, 4 Jul 2021 11:46:36 +0200 Subject: [PATCH 254/258] Bump up version to v0.1.1 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 6c6aa7cb..6da28dde 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.1.0 \ No newline at end of file +0.1.1 \ No newline at end of file From 3c0454490f048f9f32b201c5efa12b4af57c02c4 Mon Sep 17 00:00:00 2001 From: eren golge Date: Tue, 6 Jul 2021 11:05:05 +0200 Subject: [PATCH 255/258] Fix #616 --- TTS/tts/models/base_tacotron.py | 2 -- recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py index a99e1926..b7056e06 100644 --- a/TTS/tts/models/base_tacotron.py +++ b/TTS/tts/models/base_tacotron.py @@ -281,6 +281,4 @@ class BaseTacotron(BaseTTS): self.decoder.set_r(r) if trainer.config.bidirectional_decoder: trainer.model.decoder_backward.set_r(r) - trainer.train_loader = trainer.setup_train_dataloader(self.ap, self.model.decoder.r, verbose=True) - trainer.eval_loader = trainer.setup_eval_dataloder(self.ap, self.model.decoder.r) print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index e3531851..d787c138 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -1,4 +1,5 @@ { + "model": "Tacotron2", "datasets": [ { "name": "ljspeech", @@ -38,7 +39,6 @@ }, "distributed_backend": "gloo", "distributed_url": "tcp:\/\/localhost:54321", - "model": "Tacotron2", "run_name": "ljspeech-ddc", "run_description": "tacotron2 with double decoder consistency.", "batch_size": 64, From 8fbadad68e4e511d759bba96f8ed26530af0f961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 6 Jul 2021 14:44:59 +0200 Subject: [PATCH 256/258] Bump up to v0.1.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 6da28dde..8294c184 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.1.1 \ No newline at end of file +0.1.2 \ No newline at end of file From 6e3e6d57563a33e221a672598c9d8d45dc75fd7c Mon Sep 17 00:00:00 2001 From: Aloento <11802769+Aloento@users.noreply.github.com> Date: Thu, 8 Jul 2021 09:53:13 +0200 Subject: [PATCH 257/258] Change to _get_preprocessor_by_name --- TTS/bin/find_unique_chars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 75169569..9e62657f 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -3,7 +3,7 @@ import argparse import os from argparse import RawTextHelpFormatter -from TTS.tts.datasets.formatters import get_preprocessor_by_name +from TTS.tts.datasets import _get_preprocessor_by_name def main(): @@ -27,7 +27,7 @@ def main(): args = parser.parse_args() - preprocessor = get_preprocessor_by_name(args.dataset) + preprocessor = _get_preprocessor_by_name(args.dataset) items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file)) texts = "".join(item[0] for item in items) chars = set(texts) From 4eac1c4651f0e9adf3d0618cfac10ea4d4e8bd01 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 11 Jul 2021 12:00:39 -0300 Subject: 
[PATCH 258/258] bug fix on train_encoder and unit tests --- TTS/bin/train_encoder.py | 2 +- tests/test_speaker_encoder_train.py | 49 ++++++++++++++++++----------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 38902a18..2bb5bfc7 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -164,7 +164,7 @@ def main(args): # pylint: disable=redefined-outer-name elif c.loss == "angleproto": criterion = AngleProtoLoss() elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) + criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers) else: raise Exception("The %s not is a loss supported" % c.loss) diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index 21b12074..4419a00f 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -6,7 +6,18 @@ from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + +config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeakerEncoderConfig( @@ -24,16 +35,9 @@ config.audio.do_trim_silence = True config.audio.trim_db = 60 config.save_json(config_path) +print(config) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " -) -run_cli(command_train) +run_test_train() # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) @@ -50,15 +54,7 @@ config.model_params["model_name"] = "resnet" config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " -) -run_cli(command_train) +run_test_train() # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) @@ -69,3 +65,18 @@ command_train = ( ) run_cli(command_train) shutil.rmtree(continue_path) + +# test model with ge2e loss function +config.loss = "ge2e" +config.save_json(config_path) +run_test_train() + +# test model with angleproto loss function +config.loss = "angleproto" +config.save_json(config_path) +run_test_train() + +# test model with softmaxproto loss function +config.loss = "softmaxproto" 
+config.save_json(config_path) +run_test_train()
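
As a closing editorial sketch (not part of any patch in this series): the last commit exercises the three supported speaker-encoder losses ("ge2e", "angleproto", "softmaxproto") by rewriting the test config and re-running the training CLI. The same pattern can be reproduced outside the test suite roughly as follows; the config, dataset, and output paths are placeholders, and every `SpeakerEncoderConfig` field other than `loss` and `model_params` is left at its default for brevity.

    # Hedged sketch: cycle the speaker encoder over the three loss functions by
    # editing the config and re-invoking TTS/bin/train_encoder.py, mirroring the
    # test added in PATCH 258. All paths below are placeholders.
    import subprocess

    from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig

    config_path = "speaker_encoder_config.json"  # placeholder
    output_path = "encoder_runs"                 # placeholder
    dataset_path = "data/ljspeech"               # placeholder

    config = SpeakerEncoderConfig()
    config.model_params["model_name"] = "lstm"   # "resnet" is the other option used in the test

    for loss_name in ("ge2e", "angleproto", "softmaxproto"):
        config.loss = loss_name
        config.save_json(config_path)
        command = (
            f"python TTS/bin/train_encoder.py --config_path {config_path} "
            f"--coqpit.output_path {output_path} "
            "--coqpit.datasets.0.name ljspeech "
            "--coqpit.datasets.0.meta_file_train metadata.csv "
            "--coqpit.datasets.0.meta_file_val metadata.csv "
            f"--coqpit.datasets.0.path {dataset_path} "
        )
        subprocess.run(command, shell=True, check=True)

Note that, after the fix in PATCH 258, the "softmaxproto" branch reads the projection size from `c.model_params["proj_dim"]`, so switching between the three losses needs no extra arguments beyond the `loss` field shown above.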