diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 74d5e85b..68be9274 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,8 +18,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8] - + python-version: [3.6, 3.7, 3.8, 3.9] + experimental: [false] steps: - uses: actions/checkout@v2 - uses: actions/cache@v1 diff --git a/.gitignore b/.gitignore index 7ca905ff..6f412def 100644 --- a/.gitignore +++ b/.gitignore @@ -124,7 +124,9 @@ version.py # jupyter dummy files core +# files used internally for dev, test etc. tests/outputs/* +tests/train_outputs/* TODO.txt .vscode/* data/* @@ -132,7 +134,22 @@ notebooks/data/* TTS/tts/layers/glow_tts/monotonic_align/core.c .vscode-upload.json temp_build/* -recipes/* - -# nohup logs +recipes/WIP/* +recipes/ljspeech/LJSpeech-1.1/* +events.out* +old_configs/* +model_importers/* +model_profiling/* +docs/source/TODO/* +docs/source/models/* +.noseids +.dccache +log.txt +umap.png *.out +SocialMedia.txt +output.wav +tts_output.wav +deps.json +speakers.json +internal/* \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1ae28644..a70572dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,4 +9,19 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3 \ No newline at end of file + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort (python) + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] + - repo: https://github.com/pycqa/pylint + rev: v2.8.2 + hooks: + - id: pylint diff --git a/.pylintrc b/.pylintrc index 34c121eb..7293f5ad 100644 --- a/.pylintrc +++ b/.pylintrc @@ -61,6 +61,9 @@ confidence= # no Warning level messages displayed, use "--disable=all --enable=classes # --disable=W". disable=missing-docstring, + too-many-public-methods, + too-many-lines, + bare-except, line-too-long, fixme, wrong-import-order, diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..946d363c --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,18 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: docs/source/conf.py + +# Optionally set the version of Python and requirements required to build your docs +python: + version: 3.7 + install: + - requirements: docs/requirements.txt + - requirements: requirements.txt \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2a9620a7..831eddd5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Welcome to the 🐸TTS! -This repository is governed by the Contributor Covenant Code of Conduct - [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). +This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md). ## Where to start. We welcome everyone who likes to contribute to 🐸TTS. 
diff --git a/MANIFEST.in b/MANIFEST.in index 861cb5a7..0d8b4b4c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include requirements.txt include TTS/VERSION recursive-include TTS *.json recursive-include TTS *.html diff --git a/Makefile b/Makefile index 4dc2d588..d3d7dd41 100644 --- a/Makefile +++ b/Makefile @@ -6,15 +6,9 @@ help: target_dirs := tests TTS notebooks -system-deps: ## install linux system deps - sudo apt-get install -y libsndfile1-dev - -dev-deps: ## install development deps - pip install -r requirements.dev.txt - pip install -r requirements.tf.txt - -deps: ## install 🐸 requirements. - pip install -r requirements.txt +test_all: ## run tests and don't stop on an error. + nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id + ./run_bash_tests.sh test: ## run tests. nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id @@ -30,5 +24,24 @@ style: ## update code style. lint: ## run pylint linter. pylint ${target_dirs} +system-deps: ## install linux system deps + sudo apt-get install -y libsndfile1-dev + +dev-deps: ## install development deps + pip install -r requirements.dev.txt + pip install -r requirements.tf.txt + +doc-deps: ## install docs dependencies + pip install -r docs/requirements.txt + +build-docs: ## build the docs + cd docs && make clean && make build + +hub-deps: ## install deps for torch hub use + pip install -r requirements.hub.txt + +deps: ## install 🐸 requirements. + pip install -r requirements.txt + install: ## install 🐸 TTS for development. pip install -e .[all] diff --git a/README.md b/README.md index 92c2ee52..ee7f91f2 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,9 @@ 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality. 🐸TTS comes with [pretrained models](https://github.com/coqui-ai/TTS/wiki/Released-Models), tools for measuring dataset quality and already used in **20+ languages** for products and research projects. -[![CircleCI](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)]() +[![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)](https://github.com/coqui-ai/TTS/actions) [![License]()](https://opensource.org/licenses/MPL-2.0) +[![Docs]()](https://tts.readthedocs.io/en/latest/) [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS) [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md) [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts) @@ -16,20 +17,17 @@ 📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) -👩🏽‍🍳 [TTS training recipes](https://github.com/erogol/TTS_recipes) - 📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers) ## 💬 Where to ask questions -Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly, so that more people can benefit from it. 
+Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. | Type | Platforms | | ------------------------------- | --------------------------------------- | | 🚨 **Bug Reports** | [GitHub Issue Tracker] | -| ❔ **FAQ** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/FAQ) | | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] | | 👩‍💻 **Usage Questions** | [Github Discussions] | -| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room]| +| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] | [github issue tracker]: https://github.com/coqui-ai/tts/issues [github discussions]: https://github.com/coqui-ai/TTS/discussions @@ -40,14 +38,11 @@ Please use our dedicated channels for questions and discussion. Help is much mor ## 🔗 Links and Resources | Type | Links | | ------------------------------- | --------------------------------------- | +| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) | 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)| | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)| | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) -| 👩🏾‍🏫 **Tutorials and Examples** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/%F0%9F%90%B8-TTS-Notebooks,-Examples-and-Tutorials) | | 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)| -| 🖥️ **Demo Server** | [TTS/server](https://github.com/coqui-ai/TTS/tree/master/TTS/server)| -| 🤖 **Synthesize speech** | [TTS/README.md](https://github.com/coqui-ai/TTS#example-synthesizing-speech-on-terminal-using-the-released-models)| -| 🛠️ **Implementing a New Model** | [TTS/Wiki](https://github.com/coqui-ai/TTS/wiki/Implementing-a-New-Model-in-%F0%9F%90%B8TTS)| ## 🥇 TTS Performance

@@ -56,20 +51,19 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models ## Features -- High performance Deep Learning models for Text2Speech tasks. +- High-performance Deep Learning models for Text2Speech tasks. - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech). - Speaker Encoder to compute speaker embeddings efficiently. - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN) - Fast and efficient model training. -- Detailed training logs on console and Tensorboard. -- Support for multi-speaker TTS. -- Efficient Multi-GPUs training. +- Detailed training logs on the terminal and Tensorboard. +- Support for Multi-speaker TTS. +- Efficient, flexible, lightweight but feature complete `Trainer API`. - Ability to convert PyTorch models to Tensorflow 2.0 and TFLite for inference. -- Released models in PyTorch, Tensorflow and TFLite. +- Released and ready-to-use models. - Tools to curate Text2Speech datasets under```dataset_analysis```. -- Demo server for model testing. -- Notebooks for extensive model benchmarking. -- Modular (but not too much) code base enabling easy testing for new ideas. +- Utilities to use and test your models. +- Modular (but not too much) code base enabling easy implementation of new ideas. ## Implemented Models ### Text-to-Spectrogram @@ -98,8 +92,9 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/) - WaveGrad: [paper](https://arxiv.org/abs/2009.00713) - HiFiGAN: [paper](https://arxiv.org/abs/2010.05646) +- UnivNet: [paper](https://arxiv.org/abs/2106.07889) -You can also help us implement more models. Some 🐸TTS related work can be found [here](https://github.com/erogol/TTS-papers). +You can also help us implement more models. ## Install TTS 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**. @@ -110,7 +105,7 @@ If you are only interested in [synthesizing speech](https://github.com/coqui-ai/ pip install TTS ``` -By default this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. +By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. ```bash pip install TTS[tf] @@ -123,12 +118,6 @@ git clone https://github.com/coqui-ai/TTS pip install -e .[all,dev,notebooks,tf] # Select the relevant extras ``` -We use ```espeak-ng``` to convert graphemes to phonemes. You might need to install separately. - -```bash -sudo apt-get install espeak-ng -``` - If you are on Ubuntu (Debian), you can also run following commands for installation. ```bash @@ -137,6 +126,7 @@ $ make install ``` If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system). + ## Directory Structure ``` |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.) @@ -147,6 +137,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht |- distribute.py (train your TTS model using Multiple GPUs.) |- compute_statistics.py (compute dataset statistics for normalization.) |- convert*.py (convert target torch model to TF.) + |- ... |- tts/ (text to speech models) |- layers/ (model layer definitions) |- models/ (model definitions) @@ -156,167 +147,4 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht |- (same) |- vocoder/ (Vocoder models.)
|- (same) -``` - -## Sample Model Output -Below you see Tacotron model state after 16K iterations with batch-size 32 with LJSpeech dataset. - -> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning." - -Audio examples: [soundcloud](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2) - -example_output - -## Datasets and Data-Loading -🐸TTS provides a generic dataloader easy to use for your custom dataset. -You just need to write a simple function to format the dataset. Check ```datasets/preprocess.py``` to see some examples. -After that, you need to set ```dataset``` fields in ```config.json```. - -Some of the public datasets that we successfully applied 🐸TTS: - -- [LJ Speech](https://keithito.com/LJ-Speech-Dataset/) -- [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) -- [TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) -- [M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) -- [LibriTTS](https://openslr.org/60/) -- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 - -## Example: Synthesizing Speech on Terminal Using the Released Models. - - -After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. - -Listing released 🐸TTS models. - -```bash -tts --list_models -``` - -Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts --text "Text for TTS" \ - --model_name "///" \ - --vocoder_name "///" \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS model (Using Griffin-Lim Vocoder) - -```bash -tts --text "Text for TTS" \ - --model_path path/to/model.pth.tar \ - --config_path path/to/config.json \ - --out_path folder/to/save/output.wav -``` - -Run your own TTS and Vocoder models - -```bash -tts --text "Text for TTS" \ - --config_path path/to/config.json \ - --model_path path/to/model.pth.tar \ - --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth.tar \ - --vocoder_config_path path/to/vocoder_config.json -``` - -Run a multi-speaker TTS model from the released models list. - -```bash -tts --model_name "///" --list_speaker_idxs # list the possible speaker IDs. -tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx "" -``` - -**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. - -## Example: Using the Demo Server for Synthesizing Speech - - - - -You can boot up a demo 🐸TTS server to run inference with your models. Note that the server is not optimized for performance -but gives you an easy way to interact with the models. - -The demo server provides pretty much the same interface as the CLI command. - -```bash -tts-server -h # see the help -tts-server --list_models # list the available models. -``` - -Run a TTS model, from the release models list, with its default vocoder. 
-If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize -speech. - -```bash -tts-server --model_name "///" -``` - -Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. - -```bash -tts-server --model_name "///" \ - --vocoder_name "///" -``` - - -## Example: Training and Fine-tuning LJ-Speech Dataset -Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below. - -To start with, split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go. - -``` -shuf metadata.csv > metadata_shuf.csv -head -n 12000 metadata_shuf.csv > metadata_train.csv -tail -n 1100 metadata_shuf.csv > metadata_val.csv -``` - -To train a new model, you need to define your own ```config.json``` to define model details, trainin configuration and more (check the examples). Then call the corressponding train script. - -For instance, in order to train a tacotron or tacotron2 model on LJSpeech dataset, follow these steps. - -```bash -python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json -``` - -To fine-tune a model, use ```--restore_path```. - -```bash -python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar -``` - -To continue an old training run, use ```--continue_path```. - -```bash -python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/ -``` - -For multi-GPU training, call ```distribute.py```. It runs any provided train script in multi-GPU setting. - -```bash -CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json -``` - -Each run creates a new output folder accomodating used ```config.json```, model checkpoints and tensorboard logs. - -In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed. - -You can also enjoy Tensorboard, if you point Tensorboard argument```--logdir``` to the experiment folder. 
- -## [Contribution guidelines](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md) -### Acknowledgement -- https://github.com/keithito/tacotron (Dataset pre-processing) -- https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture) -- https://github.com/kan-bayashi/ParallelWaveGAN (GAN based vocoder library) -- https://github.com/jaywalnut310/glow-tts (Original Glow-TTS implementation) -- https://github.com/fatchord/WaveRNN/ (Original WaveRNN implementation) -- https://arxiv.org/abs/2010.05646 (Original HiFiGAN implementation) +``` \ No newline at end of file diff --git a/TTS/.models.json b/TTS/.models.json index 310dc5f0..73204db6 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,10 +4,9 @@ "ek1":{ "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", - "commit": "c802255", - "needs_phonemizer": true + "commit": "c802255" } }, "ljspeech":{ @@ -18,8 +17,7 @@ "commit": "bae2ad0f", "author": "Eren Gölge @erogol", "license": "", - "contact":"egolge@coqui.com", - "needs_phonemizer": false + "contact":"egolge@coqui.com" }, "glow-tts":{ "description": "", @@ -29,8 +27,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" }, "tacotron2-DCA": { "description": "", @@ -39,30 +36,27 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" }, "speedy-speech-wn":{ "description": "Speedy Speech model with wavenet decoder.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--en--ljspeech--speedy-speech-wn.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "77b6145", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } }, "vctk":{ "sc-glow-tts": { "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.", - "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/tts_models--en--vctk--sc-glowtts-transformer.zip", - "default_vocoder": null, + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip", + "default_vocoder": "vocoder_models/en/vctk/hifigan_v2", "commit": "b531fa69", "author": "Edresson Casanova", "license": "", - "contact":"", - "needs_phonemizer": true + "contact":"" } @@ -75,8 +69,7 @@ "commit": "bae2ad0f", "author": "Eren Gölge @erogol", "license": "", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } } }, @@ -88,8 +81,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } } }, @@ -101,8 +93,7 @@ "commit": "", "author": "Eren Gölge @erogol", "license": "MPL", - "contact":"egolge@coqui.com", - "needs_phonemizer": true + "contact":"egolge@coqui.com" } } }, @@ -122,8 +113,7 @@ "author": "@r-dh", "default_vocoder": 
"vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, - "commit": "540d811", - "needs_phonemizer": true + "commit": "540d811" } } }, @@ -134,8 +124,7 @@ "author": "@erogol", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "license":"", - "contact": "egolge@coqui.com", - "needs_phonemizer": true + "contact": "egolge@coqui.com" } } }, @@ -145,8 +134,7 @@ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/wavegrad", "author": "@thorstenMueller", - "commit": "unknown", - "needs_phonemizer": true + "commit": "unknown" } } }, @@ -157,8 +145,7 @@ "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", - "commit": "401fbd89", - "needs_phonemizer": false + "commit": "401fbd89" } } } diff --git a/TTS/VERSION b/TTS/VERSION index e3b86dd9..8294c184 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.16 +0.1.2 \ No newline at end of file diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 0a4337da..88d60d7d 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -8,8 +8,8 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.datasets.TTSDataset import TTSDataset +from TTS.tts.models import setup_model from TTS.tts.utils.io import load_checkpoint from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor @@ -75,21 +75,21 @@ Example run: # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker - model = setup_model(num_chars, num_speakers=0, c=C) + model = setup_model(C) model, _ = load_checkpoint(model, args.model_path, None, args.use_cuda) model.eval() # data loader - preprocessor = importlib.import_module("TTS.tts.datasets.preprocess") + preprocessor = importlib.import_module("TTS.tts.datasets.formatters") preprocessor = getattr(preprocessor, args.dataset) meta_data = preprocessor(args.data_path, args.dataset_metafile) - dataset = MyDataset( + dataset = TTSDataset( model.decoder.r, C.text_cleaner, compute_linear_spec=False, ap=ap, meta_data=meta_data, - tp=C.characters if "characters" in C.keys() else None, + characters=C.characters if "characters" in C.keys() else None, add_blank=C["add_blank"] if "add_blank" in C.keys() else False, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index e843150b..19bfbe3a 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -1,12 +1,17 @@ + import argparse +import glob import os +import torch from tqdm import tqdm -from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.config import BaseDatasetConfig, load_config +from TTS.speaker_encoder.utils.generic_utils import setup_model +from TTS.tts.datasets import load_meta_data from TTS.tts.utils.speakers import SpeakerManager + parser = argparse.ArgumentParser( description='Compute embedding vectors for each wav file in a dataset.' 
) @@ -24,11 +29,13 @@ parser.add_argument( ) parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.") parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) +parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + args = parser.parse_args() c_dataset = load_config(args.config_dataset_path) -train_files, dev_files = load_meta_data(c_dataset.datasets, eval_split=True, ignore_generated_eval=True) +train_files, dev_files = load_meta_data(c_dataset.datasets, eval_split=args.eval, ignore_generated_eval=True) wav_files = train_files + dev_files speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda) @@ -43,7 +50,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)): speaker_name = None # extract the embedding - embedd = speaker_manager.compute_x_vector_from_clip(wav_file) + embedd = speaker_manager.compute_d_vector_from_clip(wav_file) # create speaker_mapping if target dataset is defined wav_file_name = os.path.basename(wav_file) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index f3234c2a..6179dafc 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -10,7 +10,7 @@ from tqdm import tqdm # from TTS.utils.io import load_config from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.utils.audio import AudioProcessor @@ -77,7 +77,7 @@ def main(): print(f" > Avg mel spec mean: {mel_mean.mean()}") print(f" > Avg mel spec scale: {mel_scale.mean()}") print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f" > Avg linear spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling CONFIG.audio.stats_path = output_file_path diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index d523d01e..a6fb5d9b 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -8,10 +8,10 @@ import numpy as np import tensorflow as tf import torch +from TTS.tts.models import setup_model from TTS.tts.tf.models.tacotron2 import Tacotron2 from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.generic_utils import save_checkpoint -from TTS.tts.utils.generic_utils import setup_model from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.utils.io import load_config @@ -31,18 +31,18 @@ c = load_config(config_path) num_speakers = 0 # init torch model -num_chars = len(phonemes) if c.use_phonemes else len(symbols) -model = setup_model(num_chars, num_speakers, c) +model = setup_model(c) checkpoint = torch.load(args.torch_model_path, map_location=torch.device("cpu")) state_dict = checkpoint["model"] model.load_state_dict(state_dict) # init tf model +num_chars = len(phonemes) if c.use_phonemes else len(symbols) model_tf = Tacotron2( num_chars=num_chars, num_speakers=num_speakers, r=model.decoder.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index ea43f88b..e05747d0 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -1,47 +1,38 @@ #!/usr/bin/env python3 # 
-*- coding: utf-8 -*- -import argparse import os import pathlib import subprocess -import sys import time import torch +from TTS.trainer import TrainingArgs + def main(): """ Call train.py as a new process and pass command arguments """ - parser = argparse.ArgumentParser() + parser = TrainingArgs().init_argparse(arg_prefix="") parser.add_argument("--script", type=str, help="Target training script to distibute.") - parser.add_argument( - "--continue_path", - type=str, - help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in sys.argv - ) - args = parser.parse_args() + args, unargs = parser.parse_known_args() num_gpus = torch.cuda.device_count() group_id = time.strftime("%Y_%m_%d-%H%M%S") # set arguments for train.py folder_path = pathlib.Path(__file__).parent.absolute() - command = [os.path.join(folder_path, args.script)] + if os.path.exists(os.path.join(folder_path, args.script)): + command = [os.path.join(folder_path, args.script)] + else: + command = [args.script] command.append("--continue_path={}".format(args.continue_path)) command.append("--restore_path={}".format(args.restore_path)) command.append("--config_path={}".format(args.config_path)) command.append("--group_id=group_{}".format(group_id)) + command += unargs command.append("") # run processes @@ -50,6 +41,7 @@ def main(): my_env = os.environ.copy() my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) command[-1] = "--rank={}".format(i) + # prevent stdout for processes with rank != 0 stdout = None if i == 0 else open(os.devnull, "w") p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with processes.append(p) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index a0551484..0bd84db1 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -10,11 +10,10 @@ from torch.utils.data import DataLoader from tqdm import tqdm from TTS.config import load_config -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.datasets import load_meta_data +from TTS.tts.datasets.TTSDataset import TTSDataset +from TTS.tts.models import setup_model +from TTS.tts.utils.speakers import get_speaker_manager from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters @@ -22,13 +21,13 @@ use_cuda = torch.cuda.is_available() def setup_loader(ap, r, verbose=False): - dataset = MyDataset( + dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=False, meta_data=meta_data, ap=ap, - tp=c.characters if "characters" in c.keys() else None, + characters=c.characters if "characters" in c.keys() else None, add_blank=c["add_blank"] if "add_blank" in c.keys() else False, batch_group_size=0, min_seq_len=c.min_seq_len, @@ -39,7 +38,8 @@ def setup_loader(ap, r, verbose=False): enable_eos_bos=c.enable_eos_bos_chars, use_noise_augment=False, verbose=verbose, - speaker_mapping=speaker_mapping if 
c.use_speaker_embedding and c.use_external_speaker_embedding_file else None, + speaker_id_mapping=speaker_manager.speaker_ids, + d_vector_mapping=speaker_manager.d_vectors if c.use_speaker_embedding and c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: @@ -78,26 +78,15 @@ def format_data(data): # setup input data text_input = data[0] text_lengths = data[1] - speaker_names = data[2] mel_input = data[4] mel_lengths = data[5] item_idx = data[7] - attn_mask = data[9] + d_vectors = data[8] + speaker_ids = data[9] + attn_mask = data[10] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) - if c.use_speaker_embedding: - if c.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) @@ -106,9 +95,8 @@ def format_data(data): mel_lengths = mel_lengths.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - + if d_vectors is not None: + d_vectors = d_vectors.cuda(non_blocking=True) if attn_mask is not None: attn_mask = attn_mask.cuda(non_blocking=True) return ( @@ -117,7 +105,7 @@ def format_data(data): mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, avg_text_length, avg_spec_length, attn_mask, @@ -134,32 +122,26 @@ def inference( text_lengths, mel_input, mel_lengths, - attn_mask=None, speaker_ids=None, - speaker_embeddings=None, + d_vectors=None, ): if model_name == "glow_tts": - mel_input = mel_input.permute(0, 2, 1) # B x D x T speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids - elif speaker_embeddings is not None: - speaker_c = speaker_embeddings + elif d_vectors is not None: + speaker_c = d_vectors - model_output, *_ = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c + outputs = model.inference_with_MAS( + text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": speaker_c} ) + model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - _, postnet_outputs, *_ = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input) + postnet_outputs = outputs["model_outputs"] # normalize tacotron output if model_name == "tacotron": mel_specs = [] @@ -188,10 +170,10 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, + _, _, _, - attn_mask, item_idx, ) = format_data(data) @@ -203,9 +185,8 @@ def extract_spectrograms( text_lengths, mel_input, mel_lengths, - attn_mask, speaker_ids, - speaker_embeddings, + d_vectors, ) for idx in range(text_input.shape[0]): @@ -240,28 +221,22 @@ def extract_spectrograms( def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined - global meta_data, symbols, phonemes, model_characters, speaker_mapping + global meta_data, 
speaker_manager # Audio processor ap = AudioProcessor(**c.audio) - if "characters" in c.keys() and c["characters"]: - symbols, phonemes = make_symbols(**c.characters) - - # set model characters - model_characters = phonemes if c.use_phonemes else symbols - num_chars = len(model_characters) # load data instances - meta_data_train, meta_data_eval = load_meta_data(c.datasets) + meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True) # use eval and training partitions meta_data = meta_data_train + meta_data_eval # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, None) + speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim) + model = setup_model(c) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") @@ -299,6 +274,5 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) - c.audio["do_trim_silence"] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel - + c.audio.trim_silence = False main(args) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 8ac73235..6273b752 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -1,7 +1,7 @@ """Find all the unique characters in a dataset""" import argparse from argparse import RawTextHelpFormatter -from TTS.tts.datasets.preprocess import load_meta_data +from TTS.tts.datasets import load_meta_data from TTS.config import load_config @@ -9,7 +9,6 @@ def main(): # pylint: disable=bad-option-value parser = argparse.ArgumentParser( description="""Find all the unique characters or phonemes in a dataset.\n\n""" - """\n\n""" """ Example runs: @@ -23,9 +22,10 @@ def main(): args = parser.parse_args() c = load_config(args.config_path) + # load all datasets - train_items, dev_items = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True) - items = train_items + dev_items + train_items, eval_items = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True) + items = train_items + eval_items texts = "".join(item[0] for item in items) chars = set(texts) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index a5066e3d..9895c04e 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -157,7 +157,7 @@ def main(): parser.add_argument( "--speaker_wav", nargs="+", - help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.", default=None, ) parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None) @@ -239,7 +239,7 @@ def main(): print( " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.speaker_manager.speaker_ids) + print(synthesizer.tts_model.speaker_manager.speaker_ids) return # check the arguments against a multi-speaker model. 
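The bin-script hunks above all track the same API moves: `load_meta_data` now lives in `TTS.tts.datasets`, `setup_model` is imported from `TTS.tts.models` and takes the config object directly, and speaker handling goes through `SpeakerManager` with d-vectors (`compute_d_vector_from_clip`) rather than x-vectors. Below is a minimal sketch of how these renamed pieces fit together, using only the calls that appear in this diff; the file paths are placeholders, not files shipped with the repository.

```python
from TTS.config import load_config
from TTS.tts.datasets import load_meta_data      # moved from TTS.tts.datasets.preprocess
from TTS.tts.models import setup_model           # moved from TTS.tts.utils.generic_utils
from TTS.tts.utils.speakers import SpeakerManager

# setup_model() now builds the model from the config object alone.
config = load_config("config.json")              # placeholder path
model = setup_model(config)

# load_meta_data() with the eval-split flags used by the scripts above.
train_samples, eval_samples = load_meta_data(config.datasets, eval_split=True, ignore_generated_eval=True)

# d-vector extraction, renamed from compute_x_vector_from_clip().
manager = SpeakerManager(
    encoder_model_path="speaker_encoder.pth.tar",       # placeholder checkpoint
    encoder_config_path="speaker_encoder_config.json",  # placeholder config
    use_cuda=False,
)
d_vector = manager.compute_d_vector_from_clip("sample.wav")  # placeholder wav
```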
diff --git a/TTS/bin/train_align_tts.py b/TTS/bin/train_align_tts.py deleted file mode 100644 index 7e3921b0..00000000 --- a/TTS/bin/train_align_tts.py +++ /dev/null @@ -1,572 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.layers.losses import AlignTTSLoss -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) -# torch.autograd.set_detect_anomaly(True) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = MyDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, speaker_c, avg_text_length, avg_spec_length, item_idx - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, training_phase): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = 
time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - if decoder_output is not None: - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch, training_phase): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, dur_mas_output, alignments, _, _, 
logp = model.forward( - text_input, text_lengths, mel_targets, mel_lengths, g=speaker_c, phase=training_phase - ) - - # compute loss - loss_dict = criterion( - logp, - decoder_output, - mel_targets, - mel_lengths, - dur_output, - dur_mas_output, - text_lengths, - global_step, - phase=training_phase, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = AlignTTSLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], 
flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - - def set_phase(): - """Set AlignTTS training phase""" - if isinstance(config.phase_start_steps, list): - vals = [i < global_step for i in config.phase_start_steps] - if not True in vals: - phase = 0 - else: - phase = ( - len(config.phase_start_steps) - - [i < global_step for i in config.phase_start_steps][::-1].index(True) - - 1 - ) - else: - phase = None - return phase - - for epoch in range(0, config.epochs): - cur_phase = set_phase() - print(f"\n > Current AlignTTS phase: {cur_phase}") - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch, cur_phase - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch, cur_phase) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 48309dc9..2bb5bfc7 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,8 +13,8 @@ from TTS.speaker_encoder.dataset import SpeakerEncoderDataset from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model from TTS.speaker_encoder.utils.visual import plot_embeddings -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.utils.arguments import init_training +from TTS.trainer import init_training +from TTS.tts.datasets import load_meta_data from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict from TTS.utils.radam import RAdam @@ -164,7 +164,7 @@ 
def main(args): # pylint: disable=redefined-outer-name elif c.loss == "angleproto": criterion = AngleProtoLoss() elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers) + criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers) else: raise Exception("The %s is not a supported loss" % c.loss) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py deleted file mode 100644 index e93a4e8a..00000000 --- a/TTS/bin/train_glow_tts.py +++ /dev/null @@ -1,598 +0,0 @@ -#!/usr/bin/env python3 -"""Train Glow TTS model.""" - -import os -import sys -import time -import traceback -from random import randrange - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = MyDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths.
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - if attn_mask is not None: - attn_mask = attn_mask.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - item_idx, - ) - - -def data_depended_init(data_loader, model): - """Data depended initialization for activation normalization.""" - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(True) - - model.train() - print(" > Data depended initialization ... 
") - num_iter = 0 - with torch.no_grad(): - for _, data in enumerate(data_loader): - - # format data - text_input, text_lengths, mel_input, mel_lengths, spekaer_embed, _, _, attn_mask, _ = format_data(data) - - # forward pass model - _ = model.forward(text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=spekaer_embed) - if num_iter == config.data_dep_init_steps: - break - num_iter += 1 - - if hasattr(model, "module"): - for f in model.module.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - else: - for f in model.decoder.flows: - if getattr(f, "set_ddi", False): - f.set_ddi(False) - return model - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": 
[avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_input, mel_lengths, speaker_c, _, _, attn_mask, _ = format_data(data) - - # forward pass model - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_c - ) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["log_mle"] = reduce_tensor(loss_dict["log_mle"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - 
loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - # direct pass on model for spec predictions - target_speaker = None if speaker_c is None else speaker_c[:1] - if hasattr(model, "module"): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) - else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) - spec_pred = spec_pred.permute(0, 2, 1) - gt_spec = mel_input.permute(0, 2, 1) - - const_spec = spec_pred[0].data.cpu().numpy() - gt_spec = gt_spec[0].data.cpu().numpy() - align_img = alignments[0].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - style_wav = config.style_wav_for_test - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! 
Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = GlowTTSLoss() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - model = data_depended_init(train_loader, model) - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, 
model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py deleted file mode 100644 index 2fba3df1..00000000 --- a/TTS/bin/train_speedy_speech.py +++ /dev/null @@ -1,578 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.layers.losses import SpeedySpeechLoss -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import NoamLR, setup_torch_training_env - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False): - if is_val and not config.run_eval: - loader = None - else: - dataset = MyDataset( - r, - config.text_cleaner, - compute_linear_spec=False, - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - use_noise_augment=not is_val, - verbose=verbose, - speaker_mapping=speaker_mapping - if config.use_speaker_embedding and config.use_external_speaker_embedding_file - else None, - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - mel_input = data[4].permute(0, 2, 1) # B x D x T - mel_lengths = data[5] - item_idx = data[7] - attn_mask = data[9] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - # return precomputed embedding vector - speaker_c = data[8] - else: - # return speaker_id to be used by an embedding layer - speaker_c = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_c = torch.LongTensor(speaker_c) - else: - speaker_c = None - # compute durations from attention mask - durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) - for idx, am in enumerate(attn_mask): - # compute raw durations - c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] - # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) - c_idxs, counts = torch.unique(c_idxs, return_counts=True) - dur = torch.ones([text_lengths[idx]]).to(counts.dtype) - dur[c_idxs] = counts - # smooth the durations and set any 0 duration to 1 - # by cutting off from the largest duration indeces. - extra_frames = dur.sum() - mel_lengths[idx] - largest_idxs = torch.argsort(-dur)[:extra_frames] - dur[largest_idxs] -= 1 - assert ( - dur.sum() == mel_lengths[idx] - ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" - durations[idx, : text_lengths[idx]] = dur - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - if speaker_c is not None: - speaker_c = speaker_c.cuda(non_blocking=True) - attn_mask = attn_mask.cuda(non_blocking=True) - durations = durations.cuda(non_blocking=True) - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - attn_mask, - durations, - item_idx, - ) - - -def train(data_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch): - - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_targets, - mel_lengths, - speaker_c, - avg_text_length, - avg_spec_length, - _, - dur_target, - _, - ) = format_data(data) - - loader_time = time.time() - end_time - - global_step += 1 - optimizer.zero_grad() - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # backward pass with loss scaling - if config.mixed_precision: - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss_dict["loss"].backward() - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip) - optimizer.step() - - # setup lr - if config.noam_schedule: - scheduler.step() - - # current_lr - current_lr = optimizer.param_groups[0]["lr"] - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % 
config.print_step == 0: - log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - 1, - OUT_PATH, - model_characters, - model_loss=loss_dict["loss"], - ) - - # wait all kernels to be completed - torch.cuda.synchronize() - - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - figures = { - "prediction": plot_spectrogram(pred_spec, ap), - "ground_truth": plot_spectrogram(gt_spec, ap), - "alignment": plot_alignment(align_img), - } - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - train_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - text_input, text_lengths, mel_targets, mel_lengths, speaker_c, _, _, _, dur_target, _ = format_data(data) - - # forward pass model - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - decoder_output, dur_output, alignments = model.forward( - text_input, text_lengths, mel_lengths, dur_target, g=speaker_c - ) - - # compute loss - loss_dict = criterion( - decoder_output, mel_targets, mel_lengths, dur_output, torch.log(1 + dur_target), text_lengths - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments, binary=True) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["loss_l1"] = reduce_tensor(loss_dict["loss_l1"].data, num_gpus) - loss_dict["loss_ssim"] = reduce_tensor(loss_dict["loss_ssim"].data, num_gpus) - loss_dict["loss_dur"] = reduce_tensor(loss_dict["loss_dur"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = 
loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_targets.shape[0]) - pred_spec = decoder_output[idx].detach().data.cpu().numpy().T - gt_spec = mel_targets[idx].data.cpu().numpy().T - align_img = alignments[idx].data.cpu() - - eval_figures = { - "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - eval_audio = ap.inv_melspectrogram(pred_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch >= config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]][ - "embedding" - ] - speaker_id = None - else: - speaker_id = 0 - speaker_embedding = None - else: - speaker_id = None - speaker_embedding = None - - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, _, postnet_output, _, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=None, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? 
-def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, symbols, phonemes, model_characters, speaker_mapping - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - - # set model characters - model_characters = phonemes if config.use_phonemes else symbols - num_chars = len(model_characters) - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets, eval_split=True) - - # set the portion of the data used for training if set in config.json - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - # setup model - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim=speaker_embedding_dim) - optimizer = RAdam(model.parameters(), lr=config.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) - criterion = SpeedySpeechLoss(config) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)} ...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - # TODO: fix optimizer init, model.cuda() needs to be called before - # optimizer restore - optimizer.load_state_dict(checkpoint["optimizer"]) - if config.reinit_layers: - raise RuntimeError - model.load_state_dict(checkpoint["model"]) - except: # pylint: disable=bare-except - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["initial_lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define dataloaders - train_loader = setup_loader(ap, 1, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - train_avg_loss_dict, global_step = train( - train_loader, model, criterion, 
optimizer, scheduler, ap, global_step, epoch - ) - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py deleted file mode 100755 index 9685d0d7..00000000 --- a/TTS/bin/train_tacotron.py +++ /dev/null @@ -1,749 +0,0 @@ -#!/usr/bin/env python3 -"""Trains Tacotron based TTS models.""" - -import os -import sys -import time -import traceback -from random import randrange - -import numpy as np -import torch -from torch.utils.data import DataLoader - -from TTS.tts.datasets.preprocess import load_meta_data -from TTS.tts.datasets.TTSDataset import MyDataset -from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.io import save_best_model, save_checkpoint -from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers -from TTS.tts.utils.synthesis import synthesis -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import ( - NoamLR, - adam_weight_decay, - check_update, - gradual_training_scheduler, - set_weight_decay, - setup_torch_training_env, -) - -use_cuda, num_gpus = setup_torch_training_env(True, False) - - -def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): - if is_val and not config.run_eval: - loader = None - else: - if dataset is None: - dataset = MyDataset( - r, - config.text_cleaner, - compute_linear_spec=config.model.lower() == "tacotron", - meta_data=meta_data_eval if is_val else meta_data_train, - ap=ap, - tp=config.characters, - add_blank=config["add_blank"], - batch_group_size=0 if is_val else config.batch_group_size * config.batch_size, - min_seq_len=config.min_seq_len, - max_seq_len=config.max_seq_len, - phoneme_cache_path=config.phoneme_cache_path, - use_phonemes=config.use_phonemes, - phoneme_language=config.phoneme_language, - enable_eos_bos=config.enable_eos_bos_chars, - verbose=verbose, - speaker_mapping=( - speaker_mapping - if (config.use_speaker_embedding and config.use_external_speaker_embedding_file) - else None - ), - ) - - if config.use_phonemes and config.compute_input_seq_cache: - # precompute phonemes to have a better estimate of sequence lengths. 
- dataset.compute_input_seq(config.num_loader_workers) - dataset.sort_items() - - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_val else config.batch_size, - shuffle=False, - collate_fn=dataset.collate_fn, - drop_last=False, - sampler=sampler, - num_workers=config.num_val_loader_workers if is_val else config.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - # setup input data - text_input = data[0] - text_lengths = data[1] - speaker_names = data[2] - linear_input = data[3] if config.model.lower() in ["tacotron"] else None - mel_input = data[4] - mel_lengths = data[5] - stop_targets = data[6] - max_text_length = torch.max(text_lengths.float()) - max_spec_length = torch.max(mel_lengths.float()) - - if config.use_speaker_embedding: - if config.use_external_speaker_embedding_file: - speaker_embeddings = data[8] - speaker_ids = None - else: - speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] - speaker_ids = torch.LongTensor(speaker_ids) - speaker_embeddings = None - else: - speaker_embeddings = None - speaker_ids = None - - # set stop targets view, we predict a single stop token per iteration. - stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // config.r, -1) - stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - - # dispatch data to GPU - if use_cuda: - text_input = text_input.cuda(non_blocking=True) - text_lengths = text_lengths.cuda(non_blocking=True) - mel_input = mel_input.cuda(non_blocking=True) - mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if config.model.lower() in ["tacotron"] else None - stop_targets = stop_targets.cuda(non_blocking=True) - if speaker_ids is not None: - speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - - return ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) - - -def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch, scaler, scaler_st): - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (config.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / config.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - max_text_length, - max_spec_length, - ) = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # setup lr - if config.noam_schedule: - scheduler.step() - - optimizer.zero_grad() - if optimizer_st: - optimizer_st.zero_grad() - - with torch.cuda.amp.autocast(enabled=config.mixed_precision): - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - else: - decoder_output, 
postnet_output, alignments, stop_tokens = model( - text_input, - text_lengths, - mel_input, - mel_lengths, - speaker_ids=speaker_ids, - speaker_embeddings=speaker_embeddings, - ) - decoder_backward_output = None - alignments_backward = None - - # set the [alignment] lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # check nan loss - if torch.isnan(loss_dict["loss"]).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - # optimizer step - if config.mixed_precision: - # model optimizer step in mixed precision mode - scaler.scale(loss_dict["loss"]).backward() - scaler.unscale_(optimizer) - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - scaler.step(optimizer) - scaler.update() - - # stopnet optimizer step - if config.separate_stopnet: - scaler_st.scale(loss_dict["stopnet_loss"]).backward() - scaler.unscale_(optimizer_st) - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - scaler_st.step(optimizer) - scaler_st.update() - else: - grad_norm_st = 0 - else: - # main model optimizer step - loss_dict["loss"].backward() - optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, _ = check_update(model, config.grad_clip, ignore_stopnet=True) - optimizer.step() - - # stopnet optimizer step - if config.separate_stopnet: - loss_dict["stopnet_loss"].backward() - optimizer_st, _ = adam_weight_decay(optimizer_st) - grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) - optimizer_st.step() - else: - grad_norm_st = 0 - - # compute alignment error (the lower the better ) - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - step_time = time.time() - start_time - epoch_time += step_time - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - loss_dict["loss"] = reduce_tensor(loss_dict["loss"].data, num_gpus) - loss_dict["stopnet_loss"] = ( - reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) if config.stopnet else loss_dict["stopnet_loss"] - ) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training progress - if global_step % config.print_step == 0: - log_dict = { - "max_spec_length": [max_spec_length, 1], # value, precision - "max_text_length": [max_text_length, 1], - "step_time": [step_time, 4], - "loader_time": [loader_time, 2], - "current_lr": 
current_lr, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Plot Training Iter Stats - # reduce TB load - if global_step % config.tb_plot_step == 0: - iter_stats = { - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time, - } - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - if global_step % config.save_step == 0: - if config.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - global_step, - epoch, - model.decoder.r, - OUT_PATH, - optimizer_st=optimizer_st, - model_loss=loss_dict["postnet_loss"], - characters=model_characters, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - # Diagnostic visualizations - const_spec = postnet_output[0].data.cpu().numpy() - gt_spec = ( - linear_input[0].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[0].data.cpu().numpy() - ) - align_img = alignments[0].data.cpu().numpy() - - figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - if config.bidirectional_decoder or config.double_decoder_consistency: - figures["alignment_backward"] = plot_alignment( - alignments_backward[0].data.cpu().numpy(), output_fig=False - ) - - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - if config.model in ["Tacotron", "TacotronGST"]: - train_audio = ap.inv_spectrogram(const_spec.T) - else: - train_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_train_audios(global_step, {"TrainAudio": train_audio}, config.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Epoch Stats - if args.rank == 0: - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - if config.tb_model_param_stats: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(data_loader, model, criterion, ap, global_step, epoch): - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - c_logger.print_eval_start() - if data_loader is not None: - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - ( - text_input, - text_lengths, - mel_input, - mel_lengths, - linear_input, - stop_targets, - speaker_ids, - speaker_embeddings, - _, - _, - ) = format_data(data) - assert mel_input.shape[1] % model.decoder.r == 0 - - # forward pass model - if config.bidirectional_decoder or config.double_decoder_consistency: - ( - decoder_output, - postnet_output, - alignments, - stop_tokens, - decoder_backward_output, - alignments_backward, - ) = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model( - text_input, text_lengths, mel_input, speaker_ids=speaker_ids, speaker_embeddings=speaker_embeddings - ) - decoder_backward_output = None - alignments_backward = None - - # set the alignment lengths wrt reduction factor for guided attention - if mel_lengths.max() % model.decoder.r != 0: - alignment_lengths = ( - mel_lengths + (model.decoder.r - (mel_lengths.max() % 
model.decoder.r)) - ) // model.decoder.r - else: - alignment_lengths = mel_lengths // model.decoder.r - - # compute loss - loss_dict = criterion( - postnet_output, - decoder_output, - mel_input, - linear_input, - stop_tokens, - stop_targets, - mel_lengths, - decoder_backward_output, - alignments, - alignment_lengths, - alignments_backward, - text_lengths, - ) - - # step time - step_time = time.time() - start_time - epoch_time += step_time - - # compute alignment score - align_error = 1 - alignment_diagonal_score(alignments) - loss_dict["align_error"] = align_error - - # aggregate losses from processes - if num_gpus > 1: - loss_dict["postnet_loss"] = reduce_tensor(loss_dict["postnet_loss"].data, num_gpus) - loss_dict["decoder_loss"] = reduce_tensor(loss_dict["decoder_loss"].data, num_gpus) - if config.stopnet: - loss_dict["stopnet_loss"] = reduce_tensor(loss_dict["stopnet_loss"].data, num_gpus) - - # detach loss values - loss_dict_new = dict() - for key, value in loss_dict.items(): - if isinstance(value, (int, float)): - loss_dict_new[key] = value - else: - loss_dict_new[key] = value.item() - loss_dict = loss_dict_new - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - keep_avg.update_values(update_train_values) - - if config.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # Diagnostic visualizations - idx = np.random.randint(mel_input.shape[0]) - const_spec = postnet_output[idx].data.cpu().numpy() - gt_spec = ( - linear_input[idx].data.cpu().numpy() - if config.model in ["Tacotron", "TacotronGST"] - else mel_input[idx].data.cpu().numpy() - ) - align_img = alignments[idx].data.cpu().numpy() - - eval_figures = { - "prediction": plot_spectrogram(const_spec, ap, output_fig=False), - "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), - "alignment": plot_alignment(align_img, output_fig=False), - } - - # Sample audio - if config.model.lower() in ["tacotron"]: - eval_audio = ap.inv_spectrogram(const_spec.T) - else: - eval_audio = ap.inv_melspectrogram(const_spec.T) - tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, config.audio["sample_rate"]) - - # Plot Validation Stats - - if config.bidirectional_decoder or config.double_decoder_consistency: - align_b_img = alignments_backward[idx].data.cpu().numpy() - eval_figures["alignment2"] = plot_alignment(align_b_img, output_fig=False) - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - tb_logger.tb_eval_figures(global_step, eval_figures) - - if args.rank == 0 and epoch > config.test_delay_epochs: - if config.test_sentences_file: - with open(config.test_sentences_file, "r") as f: - test_sentences = [s.strip() for s in f.readlines()] - else: - test_sentences = [ - "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", - "Be a voice, not an echo.", - "I'm sorry Dave. I'm afraid I can't do that.", - "This cake is great. 
It's so delicious and moist.", - "Prior to November 22, 1963.", - ] - - # test sentences - test_audios = {} - test_figures = {} - print(" | > Synthesizing test sentences") - speaker_id = 0 if config.use_speaker_embedding else None - speaker_embedding = ( - speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping) - 1)]]["embedding"] - if config.use_external_speaker_embedding_file and config.use_speaker_embedding - else None - ) - style_wav = config.gst_style_input - if style_wav is None and config.gst is not None: - # initialize GST with a zero dict. - style_wav = {} - print("WARNING: You did not provide a GST style wav, so a zero tensor is used instead!") - for i in range(config.gst["gst_num_style_tokens"]): - style_wav[str(i)] = 0 - for idx, test_sentence in enumerate(test_sentences): - try: - wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis( - model, - test_sentence, - config, - use_cuda, - ap, - speaker_id=speaker_id, - speaker_embedding=speaker_embedding, - style_wav=style_wav, - truncated=False, - enable_eos_bos_chars=config.enable_eos_bos_chars, # pylint: disable=unused-argument - use_griffin_lim=True, - do_trim_silence=False, - ) - - file_path = os.path.join(AUDIO_PATH, str(global_step)) - os.makedirs(file_path, exist_ok=True) - file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) - ap.save_wav(wav, file_path) - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-prediction".format(idx)] = plot_spectrogram(postnet_output, ap, output_fig=False) - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) - except: # pylint: disable=bare-except - print(" !! Error creating Test Sentence -", idx) - traceback.print_exc() - tb_logger.tb_test_audios(global_step, test_audios, config.audio["sample_rate"]) - tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global meta_data_train, meta_data_eval, speaker_mapping, symbols, phonemes, model_characters - # Audio processor - ap = AudioProcessor(**config.audio.to_dict()) - - # setup custom characters if set in config file.
- if config.characters is not None: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, config.distributed["backend"], config.distributed["url"]) - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model_characters = phonemes if config.use_phonemes else symbols - - # load data instances - meta_data_train, meta_data_eval = load_meta_data(config.datasets) - - # set the portion of the data used for training - if config.has("train_portion"): - meta_data_train = meta_data_train[: int(len(meta_data_train) * config.train_portion)] - if config.has("eval_portion"): - meta_data_eval = meta_data_eval[: int(len(meta_data_eval) * config.eval_portion)] - - # parse speakers - num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(config, args, meta_data_train, OUT_PATH) - - model = setup_model(num_chars, num_speakers, config, speaker_embedding_dim) - - # scalers for mixed precision training - scaler = torch.cuda.amp.GradScaler() if config.mixed_precision else None - scaler_st = torch.cuda.amp.GradScaler() if config.mixed_precision and config.separate_stopnet else None - - params = set_weight_decay(model, config.wd) - optimizer = RAdam(params, lr=config.lr, weight_decay=0) - if config.stopnet and config.separate_stopnet: - optimizer_st = RAdam(model.decoder.stopnet.parameters(), lr=config.lr, weight_decay=0) - else: - optimizer_st = None - - # setup criterion - criterion = TacotronLoss(config, stopnet_pos_weight=config.stopnet_pos_weight, ga_sigma=0.4) - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - # optimizer restore - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scaler" in checkpoint and config.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except (KeyError, RuntimeError): - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], config) - model.load_state_dict(model_dict) - del model_dict - - for group in optimizer.param_groups: - group["lr"] = config.lr - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - if use_cuda: - model.cuda() - criterion.cuda() - - # DISTRUBUTED - if num_gpus > 1: - model = apply_gradient_allreduce(model) - - if config.noam_schedule: - scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps, last_epoch=args.restore_step - 1) - else: - scheduler = None - - num_params = count_parameters(model) - print("\n > Model has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = config.keep_all_best - keep_after = config.keep_after # void if keep_all_best False - - # define data loaders - train_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=True) - eval_loader = setup_loader(ap, 
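`set_init_dict` is what makes the "Partial model initialization" fallback above work: when the checkpoint and the current model no longer match exactly, only the tensors with matching names and shapes are copied over. A rough sketch of that idea, assuming nothing about the exact implementation in `TTS.utils.generic_utils`:

def partial_init_dict(model_dict, checkpoint_state):
    # Keep a checkpoint tensor only if the current model has a parameter with
    # the same name and shape; everything else keeps its fresh initialization.
    matching = {
        k: v
        for k, v in checkpoint_state.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    model_dict.update(matching)
    return model_dict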
model.decoder.r, is_val=True) - - global_step = args.restore_step - for epoch in range(0, config.epochs): - c_logger.print_epoch_start(epoch, config.epochs) - # set gradual training - if config.gradual_training is not None: - r, config.batch_size = gradual_training_scheduler(global_step, config) - config.r = r - model.decoder.set_r(r) - if config.bidirectional_decoder: - model.decoder_backward.set_r(r) - train_loader.dataset.outputs_per_step = r - eval_loader.dataset.outputs_per_step = r - train_loader = setup_loader(ap, model.decoder.r, is_val=False, dataset=train_loader.dataset) - eval_loader = setup_loader(ap, model.decoder.r, is_val=True, dataset=eval_loader.dataset) - print("\n > Number of output frames:", model.decoder.r) - # train one epoch - train_avg_loss_dict, global_step = train( - train_loader, - model, - criterion, - optimizer, - optimizer_st, - scheduler, - ap, - global_step, - epoch, - scaler, - scaler_st, - ) - # eval one epoch - eval_avg_loss_dict = evaluate(eval_loader, model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = train_avg_loss_dict["avg_postnet_loss"] - if config.run_eval: - target_loss = eval_avg_loss_dict["avg_postnet_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - global_step, - epoch, - config.r, - OUT_PATH, - model_characters, - keep_all_best=keep_all_best, - keep_after=keep_after, - scaler=scaler.state_dict() if config.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py new file mode 100644 index 00000000..c491700d --- /dev/null +++ b/TTS/bin/train_tts.py @@ -0,0 +1,14 @@ +import sys + +from TTS.trainer import Trainer, init_training + + +def main(): + """Run 🐸TTS trainer from terminal. 
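`gradual_training_scheduler` is defined elsewhere in the repo. In the Tacotron configs, `gradual_training` is usually a list of `[start_step, r, batch_size]` rows, and the scheduler returns the row whose start step was most recently passed. A sketch under that assumption:

def gradual_training_scheduler(global_step, config):
    # config.gradual_training is assumed to look like
    # [[0, 7, 64], [10000, 5, 64], [50000, 3, 32], ...]
    new_values = config.gradual_training[0]
    for values in config.gradual_training:
        if global_step >= values[0]:
            new_values = values
    return new_values[1], new_values[2]  # (r, batch_size)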
This is also necessary to run DDP training by ```distribute.py```""" + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=False) + trainer.fit() + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py new file mode 100644 index 00000000..868aae2e --- /dev/null +++ b/TTS/bin/train_vocoder.py @@ -0,0 +1,27 @@ +import os +import sys +import traceback + +from TTS.trainer import Trainer, init_training +from TTS.utils.generic_utils import remove_experiment_folder + + +def main(): + try: + args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) + trainer = Trainer(args, config, output_path, c_logger, tb_logger) + trainer.fit() + except KeyboardInterrupt: + remove_experiment_folder(output_path) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(output_path) + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py deleted file mode 100755 index 123d5a43..00000000 --- a/TTS/bin/train_vocoder_gan.py +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env python3 -# TODO: mixed precision training -"""Trains GAN based vocoder model.""" - -import itertools -import os -import sys -import time -import traceback -from inspect import signature - -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.gan_dataset import GANDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import plot_results, setup_discriminator, setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - loader = None - if not is_val or c.run_eval: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - return_pairs=c.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in c else False, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) - dataset.shuffle_mapping() - sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=num_gpus == 0, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - return loader - - -def format_data(data): - if isinstance(data[0], list): - x_G, y_G = data[0] - x_D, y_D = data[1] - if use_cuda: - x_G = x_G.cuda(non_blocking=True) - y_G = y_G.cuda(non_blocking=True) - x_D = 
x_D.cuda(non_blocking=True) - y_D = y_D.cuda(non_blocking=True) - return x_G, y_G, x_D, y_D - x, y = data - if use_cuda: - x = x.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) - return x, y, None, None - - -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model_G.train() - model_D.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, c_D, y_D = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G) - y_hat_sub = None - y_G_sub = None - y_hat_vis = y_hat # for visualization - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_hat_vis = y_hat - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - # run D with or without cond. features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - # we don't need scores for real samples for training G since they are always 1 - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - - # compute losses - loss_G_dict = criterion_G( - y_hat=y_hat, - y=y_G, - scores_fake=scores_fake, - feats_fake=feats_fake, - feats_real=feats_real, - y_hat_sub=y_hat_sub, - y_sub=y_G_sub, - ) - loss_G = loss_G_dict["G_loss"] - - # optimizer generator - optimizer_G.zero_grad() - loss_G.backward() - if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) - optimizer_G.step() - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - if c.diff_samples_for_G_and_D: - # use a different sample than generator - with torch.no_grad(): - y_hat = model_G(c_D) - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - else: - # use the same samples as generator - c_D = c_G.clone() - y_D = y_G.clone() - - # run D with or without cond. 
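The `len(signature(model_D.forward).parameters) == 2` test above is how the script decides whether the discriminator is conditional (takes the waveform plus conditioning features) or unconditional. A self-contained illustration of why the count works; the toy classes below are only for demonstration:

import torch
from inspect import signature

class ConditionalD(torch.nn.Module):
    def forward(self, x, c):   # waveform + conditioning features
        return x

class UnconditionalD(torch.nn.Module):
    def forward(self, x):      # waveform only
        return x

# `signature` on a bound method drops `self`, so the parameter counts are 2 and 1:
assert len(signature(ConditionalD().forward).parameters) == 2
assert len(signature(UnconditionalD().forward).parameters) == 1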
features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach().clone(), c_D) - D_out_real = model_D(y_D, c_D) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_D) - - # format D outputs - if isinstance(D_out_fake, tuple): - # model_D returns scores and features - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - # model D returns only scores - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] - - # optimizer discriminator - optimizer_D.zero_grad() - loss_D.backward() - if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) - optimizer_D.step() - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr_G": current_lr_G, "lr_D": current_lr_D, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) - - # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]) - end_time = time.time() - - if scheduler_G is not None: - scheduler_G.step() - - if scheduler_D is not None: - scheduler_D.step() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - torch.cuda.empty_cache() - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model_G.eval() - model_D.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - 
c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - c_G, y_G, _, _ = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - ############################## - # GENERATOR - ############################## - - # generator pass - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - y_hat_sub = None - y_G_sub = None - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat_sub = y_hat - y_hat = model_G.pqmf_synthesis(y_hat) - y_G_sub = model_G.pqmf_analysis(y_G) - - scores_fake, feats_fake, feats_real = None, None, None - if global_step > c.steps_to_start_discriminator: - - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat, c_G) - else: - D_out_fake = model_D(y_hat) - D_out_real = None - - if c.use_feat_match_loss: - with torch.no_grad(): - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - feats_real = None - else: - _, feats_real = D_out_real - else: - scores_fake = D_out_fake - feats_fake, feats_real = None, None - - # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub) - - loss_dict = dict() - for key, value in loss_G_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - ############################## - # DISCRIMINATOR - ############################## - - if global_step >= c.steps_to_start_discriminator: - # discriminator pass - with torch.no_grad(): - y_hat = model_G(c_G)[:, :, : y_G.size(2)] - - # PQMF formatting - if y_hat.shape[1] > 1: - y_hat = model_G.pqmf_synthesis(y_hat) - - # run D with or without cond. features - if len(signature(model_D.forward).parameters) == 2: - D_out_fake = model_D(y_hat.detach(), c_G) - D_out_real = model_D(y_G, c_G) - else: - D_out_fake = model_D(y_hat.detach()) - D_out_real = model_D(y_G) - - # format D outputs - if isinstance(D_out_fake, tuple): - scores_fake, feats_fake = D_out_fake - if D_out_real is None: - scores_real, feats_real = None, None - else: - scores_real, feats_real = D_out_real - else: - scores_fake = D_out_fake - scores_real = D_out_real - - # compute losses - loss_D_dict = criterion_D(scores_fake, scores_real) - - for key, value in loss_D_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - predict_waveform = y_hat[0].squeeze(0).detach().cpu().numpy() - real_waveform = y_G[0].squeeze(0).cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": predict_waveform, "eval/real_waveformo": real_waveform}, c.audio["sample_rate"] - ) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - - # synthesize a full voice - data_loader.return_segments = False - torch.cuda.empty_cache() - return 
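The `feats_fake` / `feats_real` pairs gathered above feed a feature-matching term inside `GeneratorLoss`. As an illustration only, assuming a flat list of discriminator feature maps per call (the repo's loss layer combines this with adversarial and STFT terms, and the real feature structure depends on the discriminator):

import torch.nn.functional as F

def feature_match_loss(feats_fake, feats_real):
    # L1 distance between discriminator activations for generated and real audio.
    loss = 0.0
    for fake, real in zip(feats_fake, feats_real):
        loss = loss + F.l1_loss(fake, real.detach())
    return loss / max(len(feats_fake), 1)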
keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model_gen = setup_generator(c) - model_disc = setup_discriminator(c) - - # setup criterion - criterion_gen = GeneratorLoss(c) - criterion_disc = DiscriminatorLoss(c) - - if use_cuda: - model_gen.cuda() - criterion_gen.cuda() - model_disc.cuda() - criterion_disc.cuda() - - # setup optimizers - # TODO: allow loading custom optimizers - optimizer_gen = None - optimizer_disc = None - optimizer_gen = getattr(torch.optim, c.optimizer) - optimizer_gen = optimizer_gen(model_gen.parameters(), lr=c.lr_gen, **c.optimizer_params) - optimizer_disc = getattr(torch.optim, c.optimizer) - - if c.discriminator_model == "hifigan_discriminator": - optimizer_disc = optimizer_disc( - itertools.chain(model_disc.msd.parameters(), model_disc.mpd.parameters()), - lr=c.lr_disc, - **c.optimizer_params, - ) - else: - optimizer_disc = optimizer_disc(model_disc.parameters(), lr=c.lr_disc, **c.optimizer_params) - - # schedulers - scheduler_gen = None - scheduler_disc = None - if "lr_scheduler_gen" in c: - scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: - scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) - print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) - print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint["model_disc"]) - print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - # restore schedulers if it is a continuing training. - if args.continue_path != "": - if "scheduler" in checkpoint and scheduler_gen is not None: - print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint and scheduler_disc is not None: - print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) - scheduler_disc.optimizer = optimizer_disc - if c.lr_scheduler_disc == "ExponentialLR": - scheduler_disc.last_epoch = checkpoint["epoch"] - except RuntimeError: - # restore only matching layers. 
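The optimizer setup above resolves the optimizer class by name with `getattr(torch.optim, c.optimizer)` and, for the HiFi-GAN discriminator, chains the multi-scale (`msd`) and multi-period (`mpd`) parameters into one optimizer. A standalone illustration of the same pattern; the toy module, optimizer name and learning rate are example values:

import itertools
import torch
from torch import nn

class ToyDiscriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.msd = nn.Linear(10, 1)  # stands in for the multi-scale discriminator
        self.mpd = nn.Linear(10, 1)  # stands in for the multi-period discriminator

model_disc = ToyDiscriminator()
optimizer_cls = getattr(torch.optim, "AdamW")  # e.g. c.optimizer == "AdamW"
optimizer_disc = optimizer_cls(
    itertools.chain(model_disc.msd.parameters(), model_disc.mpd.parameters()),
    lr=2e-4,
)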
- print(" > Partial model initialization...") - model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_gen.load_state_dict(model_dict) - - model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) - model_disc.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. - if args.continue_path == "": - for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen - - for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc - - print(f" > Model restored from step {checkpoint['step']:d}", flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model_gen = DDP_th(model_gen, device_ids=[args.rank]) - model_disc = DDP_th(model_disc, device_ids=[args.rank]) - - num_params = count_parameters(model_gen) - print(" > Generator has {} parameters".format(num_params), flush=True) - num_params = count_parameters(model_disc) - print(" > Discriminator has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with best loss of {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py deleted file mode 100644 index c0fcff51..00000000 --- a/TTS/bin/train_vocoder_wavegrad.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Trains WaveGrad vocoder models.""" - -import os -import sys -import time -import traceback - -import numpy as np -import torch - -# DISTRIBUTED -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.optim import Adam -from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.distribute import init_distributed -from TTS.utils.generic_utils 
import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset -from TTS.vocoder.utils.generic_utils import plot_results, setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveGradDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose, - ) - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.batch_size, - shuffle=num_gpus <= 1, - drop_last=False, - sampler=sampler, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) - - return loader - - -def format_data(data): - # return a whole audio segment - m, x = data - x = x.unsqueeze(1) - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def format_test_data(data): - # return a whole audio segment - m, x = data - m = m[None, ...] - x = x[None, None, ...] - if use_cuda: - m = m.cuda(non_blocking=True) - x = x.cuda(non_blocking=True) - return m, x - - -def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # setup noise schedule - noise_schedule = c["train_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - else: - model.compute_noise_level(betas) - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - with torch.cuda.amp.autocast(enabled=c.mixed_precision): - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - # check nan loss - if torch.isnan(loss).any(): - raise RuntimeError(f"Detected NaN loss at step {global_step}.") - - optimizer.zero_grad() - - # backward pass with loss scaling - if c.mixed_precision: - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - loss.backward() - grad_norm = torch.nn.utils.grad_clip_norm_(model.parameters(), c.clip_grad) - optimizer.step() - - # schedule update - if scheduler is not None: - scheduler.step() - - # disconnect loss values - loss_dict = dict() - 
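`compute_noise_level` and `compute_y_n` are methods of the WaveGrad model and are not shown in this diff. For orientation, the textbook diffusion noising they correspond to is roughly the following sketch; the repo's version works with continuous noise levels, so treat this only as an approximation:

import numpy as np
import torch

betas = np.linspace(1e-6, 1e-2, 1000)  # shape of c["train_noise_schedule"] values
alpha_bars = np.cumprod(1.0 - betas)   # cumulative signal-retention factors

def compute_y_n(x: torch.Tensor, t: int):
    # Mix the clean audio with gaussian noise according to the schedule;
    # the network is trained to predict `noise` given `x_noisy` and the noise scale.
    noise = torch.randn_like(x)
    scale = float(np.sqrt(alpha_bars[t]))
    x_noisy = scale * x + float(np.sqrt(1.0 - alpha_bars[t])) * noise
    return noise, x_noisy, torch.full((x.shape[0], 1), scale)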
for key, value in loss_wavegrad_dict.items(): - if isinstance(value, int): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - # epoch/step timing - step_time = time.time() - start_time - epoch_time += step_time - - # get current learning rates - current_lr = list(optimizer.param_groups)[0]["lr"] - - # update avg stats - update_train_values = dict() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": current_lr, - "grad_norm": grad_norm.item(), - } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": current_lr, "grad_norm": grad_norm.item(), "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - if args.rank == 0: - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - if c.tb_model_param_stats and args.rank == 0: - tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - for num_iter, data in enumerate(data_loader): - start_time = time.time() - - # format data - m, x = format_data(data) - loader_time = time.time() - end_time - - global_step += 1 - - # compute noisy input - if hasattr(model, "module"): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) - - # forward pass - noise_hat = model(x_noisy, m, noise_scale) - - # compute losses - loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {"wavegrad_loss": loss} - - loss_dict = dict() - for key, value in loss_wavegrad_dict.items(): - if isinstance(value, (int, float)): - loss_dict[key] = value - else: - loss_dict[key] = value.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if args.rank == 0: - data_loader.dataset.return_segments = False - samples = data_loader.dataset.load_test_samples(1) - m, x = format_test_data(samples[0]) - - # setup 
noise schedule and inference - noise_schedule = c["test_noise_schedule"] - betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) - if hasattr(model, "module"): - model.module.compute_noise_level(betas) - # compute voice - x_pred = model.module.inference(m) - else: - model.compute_noise_level(betas) - # compute voice - x_pred = model.inference(m) - - # compute spectrograms - figures = plot_results(x_pred, x, ap, global_step, "eval") - tb_logger.tb_eval_figures(global_step, figures) - - # Sample audio - sample_voice = x_pred[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - data_loader.dataset.return_segments = True - - return keep_avg.avg_values - - -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - # DISTRUBUTED - if num_gpus > 1: - init_distributed(args.rank, num_gpus, args.group_id, c.distributed["backend"], c.distributed["url"]) - - # setup models - model = setup_generator(c) - - # scaler for mixed_precision - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # setup optimizers - optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0) - - # schedulers - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - - # setup criterion - criterion = torch.nn.L1Loss().cuda() - - if use_cuda: - model.cuda() - criterion.cuda() - - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - # NOTE: Not sure if necessary - scheduler.optimizer = optimizer - if "scaler" in checkpoint and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - - # reset lr if not countinuining training. 
- for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRUBUTED - if num_gpus > 1: - model = DDP_th(model, device_ids=[args.rank]) - - num_params = count_parameters(model) - print(" > WaveGrad has {} parameters".format(num_params), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py deleted file mode 100644 index bcad9493..00000000 --- a/TTS/bin/train_vocoder_wavernn.py +++ /dev/null @@ -1,431 +0,0 @@ -#!/usr/bin/env python3 -"""Train WaveRNN vocoder model.""" - -import os -import random -import sys -import time -import traceback - -import torch -from torch.utils.data import DataLoader - -from TTS.tts.utils.visual import plot_spectrogram -from TTS.utils.arguments import init_training -from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.radam import RAdam -from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_generator -from TTS.vocoder.utils.io import save_best_model, save_checkpoint - -# from torch.utils.data.distributed import DistributedSampler - - -use_cuda, num_gpus = setup_torch_training_env(True, True) - - -def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad=c.padding, - mode=c.mode, - mulaw=c.mulaw, - is_training=not is_val, - verbose=verbose, - 
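The `mulaw=c.mulaw` flag above enables mu-law companding of the target samples so WaveRNN can model audio as discrete classes. The repo ships its own helpers for this; the standard transform is sketched below as a generic reference:

import numpy as np

def mulaw_encode(wav: np.ndarray, qc: int = 256) -> np.ndarray:
    # Compress the waveform into `qc` discrete levels.
    mu = qc - 1
    signal = np.sign(wav) * np.log1p(mu * np.abs(wav)) / np.log1p(mu)
    return np.floor((signal + 1) / 2 * mu + 0.5).astype(np.int64)

def mulaw_decode(indices: np.ndarray, qc: int = 256) -> np.ndarray:
    # Inverse transform back to a float waveform in [-1, 1].
    mu = qc - 1
    signal = 2 * (indices.astype(np.float64) / mu) - 1
    return np.sign(signal) * np.expm1(np.abs(signal) * np.log1p(mu)) / mu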
) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=c.batch_size, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=True, - ) - return loader - - -def format_data(data): - # setup input data - x_input = data[0] - mels = data[1] - y_coarse = data[2] - - # dispatch data to GPU - if use_cuda: - x_input = x_input.cuda(non_blocking=True) - mels = mels.cuda(non_blocking=True) - y_coarse = y_coarse.cuda(non_blocking=True) - - return x_input, mels, y_coarse - - -def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) - model.train() - epoch_time = 0 - keep_avg = KeepAverage() - if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) - else: - batch_n_iter = int(len(data_loader.dataset) / c.batch_size) - end_time = time.time() - c_logger.print_train_start() - # train loop - for num_iter, data in enumerate(data_loader): - start_time = time.time() - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - optimizer.zero_grad() - - if c.mixed_precision: - # mixed precision training - with torch.cuda.amp.autocast(): - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - scaler.scale(loss).backward() - scaler.unscale_(optimizer) - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - scaler.step(optimizer) - scaler.update() - else: - # full precision training - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - # compute losses - loss = criterion(y_hat, y_coarse) - if loss.item() is None: - raise RuntimeError(" [!] None loss. 
Exiting ...") - loss.backward() - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip) - optimizer.step() - - if scheduler is not None: - scheduler.step() - - # get the current learning rate - cur_lr = list(optimizer.param_groups)[0]["lr"] - - step_time = time.time() - start_time - epoch_time += step_time - - update_train_values = dict() - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time - keep_avg.update_values(update_train_values) - - # print training stats - if global_step % c.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) - - # plot step stats - if global_step % 10 == 0: - iter_stats = {"lr": cur_lr, "step_time": step_time} - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) - - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - # synthesize a full voice - rand_idx = random.randrange(0, len(train_data)) - wav_path = ( - train_data[rand_idx] if not isinstance(train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] - ) - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - tb_logger.tb_train_audios(global_step, {"train/audio": sample_wav}, c.audio["sample_rate"]) - end_time = time.time() - - # print epoch stats - c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) - - # Plot Training Epoch Stats - epoch_stats = {"epoch_time": epoch_time} - epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) - # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) - return keep_avg.avg_values, global_step - - -@torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - # create train loader - data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) - model.eval() - epoch_time = 0 - keep_avg = KeepAverage() - end_time = time.time() - c_logger.print_eval_start() - with torch.no_grad(): - for num_iter, data in enumerate(data_loader): - start_time = time.time() - # format data - x_input, mels, y_coarse = format_data(data) - loader_time = time.time() - end_time - global_step += 1 - - y_hat = model(x_input, mels) - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - loss = criterion(y_hat, y_coarse) - # Compute avg loss - # if num_gpus > 1: - # loss = 
reduce_tensor(loss.data, num_gpus) - loss_dict = dict() - loss_dict["model_loss"] = loss.item() - - step_time = time.time() - start_time - epoch_time += step_time - - # update avg stats - update_eval_values = dict() - for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time - keep_avg.update_values(update_eval_values) - - # print eval stats - if c.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - - if epoch % c.test_every_epochs == 0 and epoch != 0: - # synthesize a full voice - rand_idx = random.randrange(0, len(eval_data)) - wav_path = eval_data[rand_idx] if not isinstance(eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] - wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) - ground_mel = torch.FloatTensor(ground_mel) - if use_cuda: - ground_mel = ground_mel.cuda(non_blocking=True) - sample_wav = model.inference( - ground_mel, - c.batched, - c.target_samples, - c.overlap_samples, - ) - predict_mel = ap.melspectrogram(sample_wav) - - # Sample audio - tb_logger.tb_eval_audios(global_step, {"eval/audio": sample_wav}, c.audio["sample_rate"]) - - # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - tb_logger.tb_eval_figures(global_step, figures) - - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - return keep_avg.avg_values - - -# FIXME: move args definition/parsing inside of main? -def main(args): # pylint: disable=redefined-outer-name - # pylint: disable=global-variable-undefined - global train_data, eval_data - - # setup audio processor - ap = AudioProcessor(**c.audio.to_dict()) - - print(f" > Loading wavs from: {c.data_path}") - if c.feature_path is not None: - print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) - else: - eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) - # setup model - model_wavernn = setup_generator(c) - - # setup amp scaler - scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None - - # define train functions - if c.mode == "mold": - criterion = discretized_mix_logistic_loss - elif c.mode == "gauss": - criterion = gaussian_loss - elif isinstance(c.mode, int): - criterion = torch.nn.CrossEntropyLoss() - - if use_cuda: - model_wavernn.cuda() - if isinstance(c.mode, int): - criterion.cuda() - - optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) - - scheduler = None - if "lr_scheduler" in c: - scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) - scheduler = scheduler(optimizer, **c.lr_scheduler_params) - # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) - # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) - - # restore any checkpoint - if args.restore_path: - print(f" > Restoring from {os.path.basename(args.restore_path)}...") - checkpoint = torch.load(args.restore_path, map_location="cpu") - try: - print(" > Restoring Model...") - model_wavernn.load_state_dict(checkpoint["model"]) - print(" > Restoring Optimizer...") - optimizer.load_state_dict(checkpoint["optimizer"]) - if "scheduler" in checkpoint: - print(" > Restoring Generator LR Scheduler...") - scheduler.load_state_dict(checkpoint["scheduler"]) - scheduler.optimizer = optimizer - if "scaler" in checkpoint 
and c.mixed_precision: - print(" > Restoring AMP Scaler...") - scaler.load_state_dict(checkpoint["scaler"]) - except RuntimeError: - # retore only matching layers. - print(" > Partial model initialization...") - model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model_wavernn.load_state_dict(model_dict) - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] - else: - args.restore_step = 0 - - # DISTRIBUTED - # if num_gpus > 1: - # model = apply_gradient_allreduce(model) - - num_parameters = count_parameters(model_wavernn) - print(" > Model has {} parameters".format(num_parameters), flush=True) - - if args.restore_step == 0 or not args.best_path: - best_loss = float("inf") - print(" > Starting with inf best loss.") - else: - print(" > Restoring best loss from " f"{os.path.basename(args.best_path)} ...") - best_loss = torch.load(args.best_path, map_location="cpu")["model_loss"] - print(f" > Starting with loaded last best loss {best_loss}.") - keep_all_best = c.get("keep_all_best", False) - keep_after = c.get("keep_after", 10000) # void if keep_all_best False - - global_step = args.restore_step - for epoch in range(0, c.epochs): - c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_wavernn, optimizer, criterion, scheduler, scaler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) - c_logger.print_epoch_end(epoch, eval_avg_loss_dict) - target_loss = eval_avg_loss_dict["avg_model_loss"] - best_loss = save_best_model( - target_loss, - best_loss, - model_wavernn, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - keep_all_best=keep_all_best, - keep_after=keep_after, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None, - ) - - -if __name__ == "__main__": - args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) - try: - main(args) - except KeyboardInterrupt: - remove_experiment_folder(OUT_PATH) - try: - sys.exit(0) - except SystemExit: - os._exit(0) # pylint: disable=protected-access - except Exception: # pylint: disable=broad-except - remove_experiment_folder(OUT_PATH) - traceback.print_exc() - sys.exit(1) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index b4f1cbea..ecbe1f9a 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -1,8 +1,10 @@ import json import os import re +from typing import Dict import yaml +from coqpit import Coqpit from TTS.config.shared_configs import * from TTS.utils.generic_utils import find_module @@ -20,7 +22,18 @@ def read_json_with_comments(json_path): return data -def _search_configs(model_name): +def register_config(model_name: str) -> Coqpit: + """Find the right config for the given model name. + + Args: + model_name (str): Model name. + + Raises: + ModuleNotFoundError: No matching config for the model name. + + Returns: + Coqpit: config class. + """ config_class = None paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] for path in paths: @@ -33,7 +46,15 @@ def _search_configs(model_name): return config_class -def _process_model_name(config_dict): +def _process_model_name(config_dict: Dict) -> str: + """Format the model name as expected. It is a band-aid for the old `vocoder` model names. + + Args: + config_dict (Dict): A dictionary including the config fields. + + Returns: + str: Formatted modelname. 
+ """ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"] model_name = model_name.replace("_generator", "").replace("_discriminator", "") return model_name @@ -69,7 +90,7 @@ def load_config(config_path: str) -> None: raise TypeError(f" [!] Unknown config file type {ext}") config_dict.update(data) model_name = _process_model_name(config_dict) - config_class = _search_configs(model_name.lower()) + config_class = register_config(model_name.lower()) config = config_class() config.from_dict(config_dict) return config diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index a7976db7..669437b9 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument @dataclass @@ -180,6 +180,14 @@ class BaseTrainingConfig(Coqpit): among all the models. Args: + model (str): + Name of the model that is used in the training. + run_name (str): + Name of the experiment. This prefixes the output folder name. + run_description (str): + Short description of the experiment. + epochs (int): + Number training epochs. Defaults to 10000. batch_size (int): Training batch size. eval_batch_size (int): @@ -214,7 +222,7 @@ class BaseTrainingConfig(Coqpit): to 10000. num_loader_workers (int): Number of workers for training time dataloader. - num_val_loader_workers (int): + num_eval_loader_workers (int): Number of workers for evaluation time dataloader. output_path (str): Path for training output folder. The nonexist part of the given path is created automatically. @@ -243,8 +251,8 @@ class BaseTrainingConfig(Coqpit): keep_all_best: bool = False keep_after: int = 10000 # dataloading - num_loader_workers: int = MISSING - num_val_loader_workers: int = 0 + num_loader_workers: int = None + num_eval_loader_workers: int = 0 use_noise_augment: bool = False # paths output_path: str = None diff --git a/TTS/model.py b/TTS/model.py new file mode 100644 index 00000000..aefb925e --- /dev/null +++ b/TTS/model.py @@ -0,0 +1,147 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Tuple, Union + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn + +from TTS.utils.audio import AudioProcessor + +# pylint: skip-file + + +class BaseModel(nn.Module, ABC): + """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this. + + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + @abstractmethod + def forward(self, text: torch.Tensor, aux_input={}, **kwargs) -> Dict: + """Forward pass for the model mainly used in training. + + You can be flexible here and use different number of arguments and argument names since it is mostly used by + `train_step()` in training whitout exposing it to the out of the class. + + Args: + text (torch.Tensor): Input text character sequence ids. + aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs. + for the model. + + Returns: + Dict: model outputs. This must include an item keyed `model_outputs` as the final artifact of the model. + """ + outputs_dict = {"model_outputs": None} + ... + return outputs_dict + + @abstractmethod + def inference(self, text: torch.Tensor, aux_input={}) -> Dict: + """Forward pass for inference. 
+ + After the model is trained this is the only function that connects the model the out world. + + This function must only take a `text` input and a dictionary that has all the other model specific inputs. + We don't use `*kwargs` since it is problematic with the TorchScript API. + + Args: + text (torch.Tensor): [description] + aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc. + + Returns: + Dict: [description] + """ + outputs_dict = {"model_outputs": None} + ... + return outputs_dict + + @abstractmethod + def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + """Perform a single training step. Run the model forward pass and compute losses. + + Args: + batch (Dict): Input tensors. + criterion (nn.Module): Loss layer designed for the model. + + Returns: + Tuple[Dict, Dict]: Model ouputs and computed losses. + """ + outputs_dict = {} + loss_dict = {} # this returns from the criterion + ... + return outputs_dict, loss_dict + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """Create visualizations and waveform examples for training. + + For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to + be projected onto Tensorboard. + + Args: + ap (AudioProcessor): audio processor used at training. + batch (Dict): Model inputs used at the previous training step. + outputs (Dict): Model outputs generated at the previoud training step. + + Returns: + Tuple[Dict, np.ndarray]: training plots and output waveform. + """ + return None, None + + @abstractmethod + def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + """Perform a single evaluation step. Run the model forward pass and compute losses. In most cases, you can + call `train_step()` with no changes. + + Args: + batch (Dict): Input tensors. + criterion (nn.Module): Loss layer designed for the model. + + Returns: + Tuple[Dict, Dict]: Model ouputs and computed losses. + """ + outputs_dict = {} + loss_dict = {} # this returns from the criterion + ... + return outputs_dict, loss_dict + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + """The same as `train_log()`""" + return None, None + + @abstractmethod + def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False) -> None: + """Load a checkpoint and get ready for training or inference. + + Args: + config (Coqpit): Model configuration. + checkpoint_path (str): Path to the model checkpoint file. + eval (bool, optional): If true, init model for inference else for training. Defaults to False. + """ + ... + + def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: + """Setup an return optimizer or optimizers.""" + pass + + def get_lr(self) -> Union[float, List[float]]: + """Return learning rate(s). + + Returns: + Union[float, List[float]]: Model's initial learning rates. 
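To make the abstract interface above concrete, here is a hypothetical minimal subclass; the layer sizes, batch keys and loss handling are illustrative and do not correspond to any real 🐸TTS model:

import torch
from torch import nn
from TTS.model import BaseModel

class ToyTTS(BaseModel):
    def __init__(self, num_chars: int = 100, mel_dim: int = 80):
        super().__init__()
        self.emb = nn.Embedding(num_chars, 256)
        self.proj = nn.Linear(256, mel_dim)

    def forward(self, text, aux_input={}, **kwargs):
        mel = self.proj(self.emb(text))  # (batch, time, mel_dim)
        return {"model_outputs": mel}

    def inference(self, text, aux_input={}):
        return self.forward(text, aux_input)

    def train_step(self, batch, criterion):
        outputs = self.forward(batch["text_input"])
        loss_dict = {"loss": criterion(outputs["model_outputs"], batch["mel_input"])}
        return outputs, loss_dict

    def eval_step(self, batch, criterion):
        return self.train_step(batch, criterion)

    def load_checkpoint(self, config, checkpoint_path, eval=False):
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"])
        if eval:
            self.eval()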
+ """ + pass + + def get_scheduler(self, optimizer: torch.optim.Optimizer): + pass + + def get_criterion(self): + pass + + def format_batch(self): + pass diff --git a/TTS/trainer.py b/TTS/trainer.py new file mode 100644 index 00000000..c56be140 --- /dev/null +++ b/TTS/trainer.py @@ -0,0 +1,1067 @@ +# -*- coding: utf-8 -*- + +import glob +import importlib +import logging +import os +import platform +import re +import sys +import time +import traceback +from argparse import Namespace +from dataclasses import dataclass, field +from typing import Dict, List, Tuple, Union + +import torch +from coqpit import Coqpit +from torch import nn +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data import DataLoader + +from TTS.config import load_config, register_config +from TTS.tts.datasets import load_meta_data +from TTS.tts.models import setup_model as setup_tts_model +from TTS.tts.utils.text.symbols import parse_symbols +from TTS.utils.audio import AudioProcessor +from TTS.utils.callbacks import TrainerCallback +from TTS.utils.distribute import init_distributed +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, + to_cuda, +) +from TTS.utils.io import copy_model_files, save_best_model, save_checkpoint +from TTS.utils.logging import ConsoleLogger, TensorboardLogger +from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.models import setup_model as setup_vocoder_model + +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + +if is_apex_available(): + from apex import amp + + +@dataclass +class TrainingArgs(Coqpit): + """Trainer arguments to be defined externally. It helps integrating the `Trainer` with the higher level APIs and + set the values for distributed training.""" + + continue_path: str = field( + default="", + metadata={ + "help": "Path to a training folder to continue training. Restore the model from the last checkpoint and continue training under the same folder." + }, + ) + restore_path: str = field( + default="", + metadata={ + "help": "Path to a model checkpoit. Restore the model with the given checkpoint and start a new training." + }, + ) + best_path: str = field( + default="", + metadata={ + "help": "Best model file to be used for extracting best loss. If not specified, the latest best model in continue path is used" + }, + ) + config_path: str = field(default="", metadata={"help": "Path to the configuration file."}) + rank: int = field(default=0, metadata={"help": "Process rank in distributed training."}) + group_id: str = field(default="", metadata={"help": "Process group id in distributed training."}) + + +class Trainer: + def __init__( + self, + args: Union[Coqpit, Namespace], + config: Coqpit, + output_path: str, + c_logger: ConsoleLogger = None, + tb_logger: TensorboardLogger = None, + model: nn.Module = None, + cudnn_benchmark: bool = False, + ) -> None: + """Simple yet powerful 🐸💬 TTS trainer for PyTorch. It can train all the available `tts` and `vocoder` models + or easily be customized. + + Notes: + + Supports Automatic Mixed Precision training. 
If `Apex` is availabe, it automatically picks that, else + it uses PyTorch's native `amp` module. `Apex` may provide more stable training in some cases. + + Args: + + args (Union[Coqpit, Namespace]): Training arguments parsed either from console by `argparse` or `TrainingArgs` + config object. + + config (Coqpit): Model config object. It includes all the values necessary for initializing, training, evaluating + and testing the model. + + output_path (str): Path to the output training folder. All the files are saved under thi path. + + c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default + console logger is used. Defaults to None. + + tb_logger (TensorboardLogger, optional): Tensorboard logger. If not provided, the default logger is used. + Defaults to None. + + model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer` + initializes a model from the provided config. Defaults to None. + + cudnn_benchmark (bool): enable/disable PyTorch cudnn benchmarking. It is better to disable if the model input + length is changing batch to batch along the training. + + Examples: + + Running trainer on a model. + + >>> args = TrainingArgs(...) + >>> config = HifiganConfig(...) + >>> model = GANModel(config) + >>> trainer = Trainer(args, config, output_path, model=model) + >>> trainer.fit() + + Running trainer on a config. + + >>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,) + >>> args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) + >>> trainer = Trainer(args, config, output_path, c_logger, tb_logger) + >>> trainer.fit() + + TODO: + - Accumulate gradients b/w batches. + - Deepspeed integration + - Profiler integration. + - Overfitting to a batch. + - TPU training + """ + + # set and initialize Pytorch runtime + self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark) + + if config is None: + # parse config from console arguments + config, output_path, _, c_logger, tb_logger = process_args(args) + + self.output_path = output_path + self.args = args + self.config = config + + # init loggers + self.c_logger = ConsoleLogger() if c_logger is None else c_logger + if tb_logger is None: + self.tb_logger = TensorboardLogger(output_path, model_name=config.model) + self.tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + else: + self.tb_logger = tb_logger + log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") + self._setup_logger_config(log_file) + + self.total_steps_done = 0 + self.epochs_done = 0 + self.restore_step = 0 + self.best_loss = float("inf") + self.train_loader = None + self.eval_loader = None + self.output_audio_path = os.path.join(output_path, "test_audios") + + self.keep_avg_train = None + self.keep_avg_eval = None + + self.use_apex = self._is_apex_available() + self.use_amp_scaler = self.config.mixed_precision and self.use_cuda + + # init audio processor + self.ap = AudioProcessor(**self.config.audio.to_dict()) + + # load dataset samples + # TODO: refactor this + if "datasets" in self.config: + # load data for `tts` models + self.data_train, self.data_eval = load_meta_data(self.config.datasets) + elif self.config.feature_path is not None: + # load data for `vocoder`models + print(f" > Loading features from: {self.config.feature_path}") + self.data_eval, self.data_train = load_wav_feat_data( + self.config.data_path, self.config.feature_path, self.config.eval_split_size + ) + else: + # load data for `vocoder`models + self.data_eval, self.data_train = load_wav_data(self.config.data_path, self.config.eval_split_size) + + # init TTS model + if model is not None: + self.model = model + else: + self.model = self.get_model(self.config) + + # setup criterion + self.criterion = self.get_criterion(self.model) + + # DISTRUBUTED + if self.num_gpus > 1: + init_distributed( + args.rank, + self.num_gpus, + args.group_id, + self.config.distributed_backend, + self.config.distributed_url, + ) + + if self.use_cuda: + self.model.cuda() + if isinstance(self.criterion, list): + self.criterion = [x.cuda() for x in self.criterion] + else: + self.criterion.cuda() + + # setup optimizer + self.optimizer = self.get_optimizer(self.model, self.config) + + # callback + self.callbacks = TrainerCallback(self) + self.callbacks.on_init_start() + + # init AMP + if self.use_amp_scaler: + if self.use_apex: + self.scaler = None + self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level="O1") + if isinstance(self.optimizer, list): + self.scaler = [torch.cuda.amp.GradScaler()] * len(self.optimizer) + else: + self.scaler = torch.cuda.amp.GradScaler() + else: + self.scaler = None + + if self.args.restore_path: + self.model, self.optimizer, self.scaler, self.restore_step = self.restore_model( + self.config, args.restore_path, self.model, self.optimizer, self.scaler + ) + + # setup scheduler + self.scheduler = self.get_scheduler(self.model, self.config, self.optimizer) + + # DISTRUBUTED + if self.num_gpus > 1: + self.model = DDP_th(self.model, device_ids=[args.rank], output_device=args.rank) + + # count model size + num_params = count_parameters(self.model) + print("\n > Model has {} parameters".format(num_params)) + + self.callbacks.on_init_end() + + @staticmethod + def get_model(config: Coqpit) -> nn.Module: + """Initialize model from config. + + Args: + config (Coqpit): Model config. + + Returns: + nn.Module: initialized model. + """ + # TODO: better model setup + try: + model = setup_tts_model(config) + except ModuleNotFoundError: + model = setup_vocoder_model(config) + return model + + def restore_model( + self, + config: Coqpit, + restore_path: str, + model: nn.Module, + optimizer: torch.optim.Optimizer, + scaler: torch.cuda.amp.GradScaler = None, + ) -> Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: + """Restore training from an old run. 
It restores model, optimizer, AMP scaler and training stats. + + Args: + config (Coqpit): Model config. + restore_path (str): Path to the restored training run. + model (nn.Module): Model to restored. + optimizer (torch.optim.Optimizer): Optimizer to restore. + scaler (torch.cuda.amp.GradScaler, optional): AMP scaler to restore. Defaults to None. + + Returns: + Tuple[nn.Module, torch.optim.Optimizer, torch.cuda.amp.GradScaler, int]: [description] + """ + + def _restore_list_objs(states, obj): + if isinstance(obj, list): + for idx, state in enumerate(states): + obj[idx].load_state_dict(state) + else: + obj.load_state_dict(states) + return obj + + print(" > Restoring from %s ..." % os.path.basename(restore_path)) + checkpoint = torch.load(restore_path) + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer = _restore_list_objs(checkpoint["optimizer"], optimizer) + if "scaler" in checkpoint and self.use_amp_scaler and checkpoint["scaler"]: + print(" > Restoring AMP Scaler...") + scaler = _restore_list_objs(checkpoint["scaler"], scaler) + except (KeyError, RuntimeError): + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], config) + model.load_state_dict(model_dict) + del model_dict + + if isinstance(self.optimizer, list): + for idx, optim in enumerate(optimizer): + for group in optim.param_groups: + group["lr"] = self.get_lr(model, config)[idx] + else: + for group in optimizer.param_groups: + group["lr"] = self.get_lr(model, config) + print( + " > Model restored from step %d" % checkpoint["step"], + ) + restore_step = checkpoint["step"] + return model, optimizer, scaler, restore_step + + @staticmethod + def _get_loader( + model: nn.Module, + config: Coqpit, + ap: AudioProcessor, + is_eval: bool, + data_items: List, + verbose: bool, + num_gpus: int, + ) -> DataLoader: + if hasattr(model, "get_data_loader"): + loader = model.get_data_loader(config, ap, is_eval, data_items, verbose, num_gpus) + return loader + + def get_train_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + """Initialize and return a training data loader. + + Args: + ap (AudioProcessor): Audio processor. + data_items (List): Data samples used for training. + verbose (bool): enable/disable printing loader stats at initialization. + + Returns: + DataLoader: Initialized training data loader. + """ + return self._get_loader(self.model, self.config, ap, False, data_items, verbose, self.num_gpus) + + def get_eval_dataloader(self, ap: AudioProcessor, data_items: List, verbose: bool) -> DataLoader: + return self._get_loader(self.model, self.config, ap, True, data_items, verbose, self.num_gpus) + + def format_batch(self, batch: List) -> Dict: + """Format the dataloader output and return a batch. + + Args: + batch (List): Batch returned by the dataloader. + + Returns: + Dict: Formatted batch. + """ + batch = self.model.format_batch(batch) + if self.use_cuda: + for k, v in batch.items(): + batch[k] = to_cuda(v) + return batch + + @staticmethod + def _model_train_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a trainig forward step. Compute model outputs and losses. + + Args: + batch (Dict): [description] + model (nn.Module): [description] + criterion (nn.Module): [description] + optimizer_idx (int, optional): [description]. Defaults to None. 
+ + Returns: + Tuple[Dict, Dict]: [description] + """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + # unwrap model in DDP training + if hasattr(model, "module"): + return model.module.train_step(*input_args) + return model.train_step(*input_args) + + def _optimize( + self, + batch: Dict, + model: nn.Module, + optimizer: Union[torch.optim.Optimizer, List], + scaler: "AMPScaler", + criterion: nn.Module, + scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List], # pylint: disable=protected-access + config: Coqpit, + optimizer_idx: int = None, + ) -> Tuple[Dict, Dict, int, torch.Tensor]: + """Perform a forward - backward pass and run the optimizer. + + Args: + batch (Dict): Input batch. If + model (nn.Module): Model for training. Defaults to None. + optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use. + scaler (AMPScaler): AMP scaler. + criterion (nn.Module): Model's criterion. + scheduler (Union[torch.optim.lr_scheduler._LRScheduler, List]): LR scheduler used by the optimizer. + config (Coqpit): Model config. + optimizer_idx (int, optional): Target optimizer being used. Defaults to None. + + Raises: + RuntimeError: When the loss is NaN. + + Returns: + Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm. + """ + step_start_time = time.time() + # zero-out optimizer + optimizer.zero_grad() + with torch.cuda.amp.autocast(enabled=config.mixed_precision): + if optimizer_idx is not None: + outputs, loss_dict = self._model_train_step(batch, model, criterion, optimizer_idx=optimizer_idx) + else: + outputs, loss_dict = self._model_train_step(batch, model, criterion) + + # skip the rest + if outputs is None: + step_time = time.time() - step_start_time + return None, {}, step_time, 0 + + # check nan loss + if torch.isnan(loss_dict["loss"]).any(): + raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") + + # set gradient clipping threshold + if "grad_clip" in config and config.grad_clip is not None: + if optimizer_idx is not None: + grad_clip = config.grad_clip[optimizer_idx] + else: + grad_clip = config.grad_clip + else: + grad_clip = 0.0 # meaning no gradient clipping + + # TODO: compute grad norm + if grad_clip <= 0: + grad_norm = 0 + + # optimizer step + update_lr_scheduler = True + if self.use_amp_scaler: + if self.use_apex: + with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss: + scaled_loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), + grad_clip, + ) + else: + # model optimizer step in mixed precision mode + scaler.scale(loss_dict["loss"]).backward() + scaler.unscale_(optimizer) + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + scale_prev = scaler.get_scale() + scaler.step(optimizer) + scaler.update() + update_lr_scheduler = scale_prev <= scaler.get_scale() + else: + # main model optimizer step + loss_dict["loss"].backward() + if grad_clip > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + optimizer.step() + + step_time = time.time() - step_start_time + + # setup lr + if scheduler is not None and update_lr_scheduler: + scheduler.step() + + # detach losses + loss_dict = self._detach_loss_dict(loss_dict) + if optimizer_idx is not None: + loss_dict[f"loss_{optimizer_idx}"] = loss_dict.pop("loss") + loss_dict[f"grad_norm_{optimizer_idx}"] = grad_norm + 
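# NOTE: `loss_dict` is already detached above, so the caller can log or average it
+ # without keeping the autograd graph alive; with multiple optimizers the keys are
+ # suffixed with the optimizer index so each loss is tracked separately.
+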
return outputs, loss_dict, step_time, grad_norm + + @staticmethod + def _detach_loss_dict(loss_dict: Dict) -> Dict: + """Detach loss values from autograp. + + Args: + loss_dict (Dict): losses. + + Returns: + Dict: losses detached from autograph. + """ + loss_dict_detached = {} + for key, value in loss_dict.items(): + if isinstance(value, (int, float)): + loss_dict_detached[key] = value + else: + loss_dict_detached[key] = value.item() + return loss_dict_detached + + def train_step(self, batch: Dict, batch_n_steps: int, step: int, loader_start_time: float) -> Tuple[Dict, Dict]: + """Perform a training step on a batch of inputs and log the process. + + Args: + batch (Dict): Input batch. + batch_n_steps (int): Number of steps needed to complete an epoch. Needed for logging. + step (int): Current step number in this epoch. + loader_start_time (float): The time when the data loading is started. Needed for logging. + + Returns: + Tuple[Dict, Dict]: Model outputs and losses. + """ + self.callbacks.on_train_step_start() + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + + # conteainers to hold model outputs and losses for each optimizer. + outputs_per_optimizer = None + log_dict = {} + loss_dict = {} + if not isinstance(self.optimizer, list): + # training with a single optimizer + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, self.optimizer, self.scaler, self.criterion, self.scheduler, self.config + ) + loss_dict.update(loss_dict_new) + else: + # training with multiple optimizers (e.g. GAN) + outputs_per_optimizer = [None] * len(self.optimizer) + total_step_time = 0 + for idx, optimizer in enumerate(self.optimizer): + criterion = self.criterion + scaler = self.scaler[idx] if self.use_amp_scaler else None + scheduler = self.scheduler[idx] + outputs, loss_dict_new, step_time, grad_norm = self._optimize( + batch, self.model, optimizer, scaler, criterion, scheduler, self.config, idx + ) + # skip the rest if the model returns None + total_step_time += step_time + outputs_per_optimizer[idx] = outputs + # if None, model skipped this optimizer + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + keep_avg_update = dict() + for key, value in log_dict.items(): + keep_avg_update["avg_" + key] = value + keep_avg_update["avg_loader_time"] = loader_time + keep_avg_update["avg_step_time"] = step_time + self.keep_avg_train.update_values(keep_avg_update) + + # print training progress + if self.total_steps_done % self.config.print_step == 0: + # log learning rates + lrs = {} + if isinstance(self.optimizer, list): + for idx, optimizer in enumerate(self.optimizer): + current_lr = self.optimizer[idx].param_groups[0]["lr"] + lrs.update({f"current_lr_{idx}": current_lr}) + else: + current_lr = self.optimizer.param_groups[0]["lr"] + lrs = {"current_lr": current_lr} + log_dict.update(lrs) + if grad_norm > 0: + log_dict.update({"grad_norm": grad_norm}) + # log run-time stats + log_dict.update( + { + "step_time": round(step_time, 4), + "loader_time": round(loader_time, 4), + } + ) + self.c_logger.print_train_step( + batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values + ) + + if self.args.rank == 0: + # Plot Training Iter Stats + # reduce TB load and don't log every step + if self.total_steps_done % self.config.tb_plot_step == 0: + iter_stats = log_dict + iter_stats.update(loss_dict) + 
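# `iter_stats` now bundles the runtime stats (learning rates, step and loader time,
+ # grad norm) with the detached losses; it is pushed to Tensorboard only every
+ # `tb_plot_step` steps to keep the event files small.
+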
self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats) + if self.total_steps_done % self.config.save_step == 0 and self.total_steps_done != 0: + if self.config.checkpoint: + # checkpoint the model + model_loss = ( + loss_dict[self.config.target_loss] if "target_loss" in self.config else loss_dict["loss"] + ) + save_checkpoint( + self.config, + self.model, + self.optimizer, + self.scaler if self.use_amp_scaler else None, + self.total_steps_done, + self.epochs_done, + self.output_path, + model_loss=model_loss, + ) + # training visualizations + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"): + figures, audios = self.model.module.train_log(self.ap, batch, outputs) + elif hasattr(self.model, "train_log"): + figures, audios = self.model.train_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_train_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_train_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.total_steps_done += 1 + self.callbacks.on_train_step_end() + return outputs, loss_dict + + def train_epoch(self) -> None: + """Main entry point for the training loop. Run training on the all training samples.""" + self.train_loader = self.get_train_dataloader( + self.ap, + self.data_train, + verbose=True, + ) + self.model.train() + epoch_start_time = time.time() + if self.use_cuda: + batch_num_steps = int(len(self.train_loader.dataset) / (self.config.batch_size * self.num_gpus)) + else: + batch_num_steps = int(len(self.train_loader.dataset) / self.config.batch_size) + self.c_logger.print_train_start() + for cur_step, batch in enumerate(self.train_loader): + loader_start_time = time.time() + _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) + epoch_time = time.time() - epoch_start_time + # Plot self.epochs_done Stats + if self.args.rank == 0: + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(self.keep_avg_train.avg_values) + self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) + if self.config.tb_model_param_stats: + self.tb_logger.tb_model_weights(self.model, self.total_steps_done) + + @staticmethod + def _model_eval_step( + batch: Dict, model: nn.Module, criterion: nn.Module, optimizer_idx: int = None + ) -> Tuple[Dict, Dict]: + """ + Perform a evaluation forward pass. Compute model outputs and losses with no gradients. + + Args: + batch (Dict): IBatch of inputs. + model (nn.Module): Model to call evaluation. + criterion (nn.Module): Model criterion. + optimizer_idx (int, optional): Optimizer ID to define the closure in multi-optimizer training. Defaults to None. + + Returns: + Tuple[Dict, Dict]: model outputs and losses. + """ + input_args = [batch, criterion] + if optimizer_idx is not None: + input_args.append(optimizer_idx) + if hasattr(model, "module"): + return model.module.eval_step(*input_args) + return model.eval_step(*input_args) + + def eval_step(self, batch: Dict, step: int) -> Tuple[Dict, Dict]: + """Perform a evaluation step on a batch of inputs and log the process. + + Args: + batch (Dict): Input batch. + step (int): Current step number in this epoch. + + Returns: + Tuple[Dict, Dict]: Model outputs and losses. 
+ """ + with torch.no_grad(): + outputs_per_optimizer = None + loss_dict = {} + if not isinstance(self.optimizer, list): + outputs, loss_dict = self._model_eval_step(batch, self.model, self.criterion) + else: + outputs_per_optimizer = [None] * len(self.optimizer) + for idx, _ in enumerate(self.optimizer): + criterion = self.criterion + outputs, loss_dict_new = self._model_eval_step(batch, self.model, criterion, idx) + outputs_per_optimizer[idx] = outputs + if loss_dict_new is not None: + loss_dict.update(loss_dict_new) + outputs = outputs_per_optimizer + + # update avg stats + update_eval_values = dict() + for key, value in loss_dict.items(): + update_eval_values["avg_" + key] = value + self.keep_avg_eval.update_values(update_eval_values) + + if self.config.print_eval: + self.c_logger.print_eval_step(step, loss_dict, self.keep_avg_eval.avg_values) + return outputs, loss_dict + + def eval_epoch(self) -> None: + """Main entry point for the evaluation loop. Run evaluation on the all validation samples.""" + self.eval_loader = ( + self.get_eval_dataloader( + self.ap, + self.data_eval, + verbose=True, + ) + if self.config.run_eval + else None + ) + + self.model.eval() + self.c_logger.print_eval_start() + loader_start_time = time.time() + batch = None + for cur_step, batch in enumerate(self.eval_loader): + # format data + batch = self.format_batch(batch) + loader_time = time.time() - loader_start_time + self.keep_avg_eval.update_values({"avg_loader_time": loader_time}) + outputs, _ = self.eval_step(batch, cur_step) + # plot epoch stats, artifacts and figures + if self.args.rank == 0: + figures, audios = None, None + if hasattr(self.model, "module") and hasattr(self.model.module, "eval_log"): + figures, audios = self.model.module.eval_log(self.ap, batch, outputs) + elif hasattr(self.model, "eval_log"): + figures, audios = self.model.eval_log(self.ap, batch, outputs) + if figures is not None: + self.tb_logger.tb_eval_figures(self.total_steps_done, figures) + if audios is not None: + self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) + self.tb_logger.tb_eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values) + + def test_run(self) -> None: + """Run test and log the results. Test run must be defined by the model. 
+ Model must return figures and audios to be logged by the Tensorboard."""
+ if hasattr(self.model, "test_run"):
+ if hasattr(self.eval_loader, "load_test_samples"):
+ samples = self.eval_loader.load_test_samples(1)
+ figures, audios = self.model.test_run(samples)
+ else:
+ figures, audios = self.model.test_run()
+ self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
+ self.tb_logger.tb_test_figures(self.total_steps_done, figures)
+
+ def _fit(self) -> None:
+ """🏃 train -> evaluate -> test for the number of epochs."""
+ if self.restore_step != 0 or self.args.best_path:
+ print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...")
+ self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"]
+ print(f" > Starting with loaded last best loss {self.best_loss}.")
+
+ self.total_steps_done = self.restore_step
+
+ for epoch in range(0, self.config.epochs):
+ self.callbacks.on_epoch_start()
+ self.keep_avg_train = KeepAverage()
+ self.keep_avg_eval = KeepAverage() if self.config.run_eval else None
+ self.epochs_done = epoch
+ self.c_logger.print_epoch_start(epoch, self.config.epochs)
+ self.train_epoch()
+ if self.config.run_eval:
+ self.eval_epoch()
+ if epoch >= self.config.test_delay_epochs and self.args.rank <= 0:
+ self.test_run()
+ self.c_logger.print_epoch_end(
+ epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values
+ )
+ self.save_best_model()
+ self.callbacks.on_epoch_end()
+
+ def fit(self) -> None:
+ """Where the ✨️magic✨️ happens..."""
+ try:
+ self._fit()
+ except KeyboardInterrupt:
+ self.callbacks.on_keyboard_interrupt()
+ # if the output folder is empty remove the run.
+ remove_experiment_folder(self.output_path)
+ # stop without error signal
+ try:
+ sys.exit(0)
+ except SystemExit:
+ os._exit(0) # pylint: disable=protected-access
+ except BaseException: # pylint: disable=broad-except
+ remove_experiment_folder(self.output_path)
+ traceback.print_exc()
+ sys.exit(1)
+
+ def save_best_model(self) -> None:
+ """Save the best model. It only saves if the current target loss is smaller than the previous."""
+ self.best_loss = save_best_model(
+ self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"],
+ self.best_loss,
+ self.config,
+ self.model,
+ self.optimizer,
+ self.scaler if self.use_amp_scaler else None,
+ self.total_steps_done,
+ self.epochs_done,
+ self.output_path,
+ keep_all_best=self.config.keep_all_best,
+ keep_after=self.config.keep_after,
+ )
+
+ @staticmethod
+ def _setup_logger_config(log_file: str) -> None:
+ logging.basicConfig(
+ level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()]
+ )
+
+ @staticmethod
+ def _is_apex_available() -> bool:
+ """Check if Nvidia's APEX is available."""
+ return importlib.util.find_spec("apex") is not None
+
+ @staticmethod
+ def get_optimizer(model: nn.Module, config: Coqpit) -> Union[torch.optim.Optimizer, List]:
+ """Receive the optimizer from the model if model implements `get_optimizer()` else
+ check the optimizer parameters in the config and try to initialize the optimizer.
+
+ Args:
+ model (nn.Module): Training model.
+ config (Coqpit): Training configuration.
+
+ Returns:
+ Union[torch.optim.Optimizer, List]: An optimizer or a list of optimizers. GAN models define a list.
+ """ + if hasattr(model, "get_optimizer"): + optimizer = model.get_optimizer() + if optimizer is None: + optimizer_name = config.optimizer + optimizer_params = config.optimizer_params + return get_optimizer(optimizer_name, optimizer_params, config.lr, model) + return optimizer + + @staticmethod + def get_lr(model: nn.Module, config: Coqpit) -> Union[float, List[float]]: + """Set the initial learning rate by the model if model implements `get_lr()` else try setting the learning rate + fromthe config. + + Args: + model (nn.Module): Training model. + config (Coqpit): Training configuration. + + Returns: + Union[float, List[float]]: A single learning rate or a list of learning rates, one for each optimzier. + """ + lr = None + if hasattr(model, "get_lr"): + lr = model.get_lr() + if lr is None: + lr = config.lr + return lr + + @staticmethod + def get_scheduler( + model: nn.Module, config: Coqpit, optimizer: Union[torch.optim.Optimizer, List] + ) -> Union[torch.optim.lr_scheduler._LRScheduler, List]: # pylint: disable=protected-access + """Receive the scheduler from the model if model implements `get_scheduler()` else + check the config and try initiating the scheduler. + + Args: + model (nn.Module): Training model. + config (Coqpit): Training configuration. + + Returns: + Union[torch.optim.Optimizer, List]: A scheduler or a list of schedulers, one for each optimizer. + """ + scheduler = None + if hasattr(model, "get_scheduler"): + scheduler = model.get_scheduler(optimizer) + if scheduler is None: + lr_scheduler = config.lr_scheduler + lr_scheduler_params = config.lr_scheduler_params + return get_scheduler(lr_scheduler, lr_scheduler_params, optimizer) + return scheduler + + @staticmethod + def get_criterion(model: nn.Module) -> nn.Module: + """Receive the criterion from the model. Model must implement `get_criterion()`. + + Args: + model (nn.Module): Training model. + + Returns: + nn.Module: Criterion layer. + """ + criterion = None + criterion = model.get_criterion() + return criterion + + +def init_arguments(): + train_config = TrainingArgs() + parser = train_config.init_argparse(arg_prefix="") + return parser + + +def get_last_checkpoint(path): + """Get latest checkpoint or/and best model in path. + + It is based on globbing for `*.pth.tar` and the RegEx + `(checkpoint|best_model)_([0-9]+)`. + + Args: + path (list): Path to files to be compared. + + Raises: + ValueError: If no checkpoint or best_model files are found. + + Returns: + last_checkpoint (str): Last checkpoint filename. + """ + file_names = glob.glob(os.path.join(path, "*.pth.tar")) + last_models = {} + last_model_nums = {} + for key in ["checkpoint", "best_model"]: + last_model_num = None + last_model = None + # pass all the checkpoint files and find + # the one with the largest model number suffix. + for file_name in file_names: + match = re.search(f"{key}_([0-9]+)", file_name) + if match is not None: + model_num = int(match.groups()[0]) + if last_model_num is None or model_num > last_model_num: + last_model_num = model_num + last_model = file_name + + # if there is not checkpoint found above + # find the checkpoint with the latest + # modification date. 
+ key_file_names = [fn for fn in file_names if key in fn] + if last_model is None and len(key_file_names) > 0: + last_model = max(key_file_names, key=os.path.getctime) + last_model_num = torch.load(last_model)["step"] + + if last_model is not None: + last_models[key] = last_model + last_model_nums[key] = last_model_num + + # check what models were found + if not last_models: + raise ValueError(f"No models found in continue path {path}!") + if "checkpoint" not in last_models: # no checkpoint just best model + last_models["checkpoint"] = last_models["best_model"] + elif "best_model" not in last_models: # no best model + # this shouldn't happen, but let's handle it just in case + last_models["best_model"] = None + # finally check if last best model is more recent than checkpoint + elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: + last_models["checkpoint"] = last_models["best_model"] + + return last_models["checkpoint"], last_models["best_model"] + + +def process_args(args, config=None): + """Process parsed comand line arguments and initialize the config if not provided. + + Args: + args (argparse.Namespace or dict like): Parsed input arguments. + config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None. + + Returns: + c (TTS.utils.io.AttrDict): Config paramaters. + out_path (str): Path to save models and logging. + audio_path (str): Path to save generated test audios. + c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does + logging to the console. + tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does + the TensorBoard logging. + + TODO: + - Interactive config definition. + """ + if isinstance(args, tuple): + args, coqpit_overrides = args + if args.continue_path: + # continue a previous training from its output folder + experiment_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + args.restore_path, best_model = get_last_checkpoint(args.continue_path) + if not args.best_path: + args.best_path = best_model + # init config if not already defined + if config is None: + if args.config_path: + # init from a file + config = load_config(args.config_path) + else: + # init from console args + from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel + + config_base = BaseTrainingConfig() + config_base.parse_known_args(coqpit_overrides) + config = register_config(config_base.model)() + # override values from command-line args + config.parse_known_args(coqpit_overrides, relaxed_parser=True) + if config.mixed_precision: + print(" > Mixed precision mode is ON") + experiment_path = args.continue_path + if not experiment_path: + experiment_path = create_experiment_folder(config.output_path, config.run_name) + audio_path = os.path.join(experiment_path, "test_audios") + # setup rank 0 process in distributed training + tb_logger = None + if args.rank == 0: + os.makedirs(audio_path, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + # if model characters are not set in the config file + # save the default set to the config file for future + # compatibility. 
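+ # `new_fields` is merged into the config copy written to the experiment folder,
+ # so a restored run reuses exactly the same character set.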
+ if config.has("characters_config"): + used_characters = parse_symbols() + new_fields["characters"] = used_characters + copy_model_files(config, experiment_path, new_fields) + os.chmod(audio_path, 0o775) + os.chmod(experiment_path, 0o775) + tb_logger = TensorboardLogger(experiment_path, model_name=config.model) + # write model desc to tensorboard + tb_logger.tb_add_text("model-config", f"
<pre>{config.to_json()}</pre>
", 0) + c_logger = ConsoleLogger() + return config, experiment_path, audio_path, c_logger, tb_logger + + +def init_training(argv: Union[List, Coqpit], config: Coqpit = None): + """Initialization of a training run.""" + if isinstance(argv, Coqpit): + parser = argv.init_argparse(arg_prefix="") + else: + parser = init_arguments() + args = parser.parse_known_args() + config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, config) + return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 2956d935..837cd519 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.align_tts import AlignTTSArgs @dataclass @@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params - positional_encoding: bool = True - hidden_channels_dp: int = 256 - hidden_channels: int = 256 - encoder_type: str = "fftransformer" - encoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) - decoder_type: str = "fftransformer" - decoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) + model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) phase_start_steps: List[int] = None ssim_alpha: float = 1.0 @@ -88,17 +79,29 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "Adam" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # overrides min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 + + # testing + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 36ccb612..caf2f71b 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -7,7 +7,7 @@ from TTS.tts.configs.shared_configs import BaseTTSConfig class GlowTTSConfig(BaseTTSConfig): """Defines parameters for GlowTTS model. - Example: + Example: >>> from TTS.tts.configs import GlowTTSConfig >>> config = GlowTTSConfig() @@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig): Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}` use_encoder_prenet (bool): enable / disable the use of a prenet for the encoder. Defaults to True. - hidden_channels_encoder (int): + hidden_channels_enc (int): Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): + hidden_channels_dec (int): Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): + hidden_channels_dp (int): Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + mean_only (bool): + If true predict only the mean values by the decoder flow. Defaults to True. + out_channels (int): + Number of channels of the model output tensor. Defaults to 80. + num_flow_blocks_dec (int): + Number of decoder blocks. Defaults to 12. + inference_noise_scale (float): + Noise scale used at inference. Defaults to 0.33. + kernel_size_dec (int): + Decoder kernel size. Defaults to 5 + dilation_rate (int): + Rate to increase dilation by each layer in a decoder block. Defaults to 1. + num_block_layers (int): + Number of decoder layers in each decoder block. Defaults to 4. + dropout_p_dec (float): + Dropout rate for decoder. Defaults to 0.1. + num_speaker (int): + Number of speaker to define the size of speaker embedding layer. Defaults to 0. + c_in_channels (int): + Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0. + num_splits (int): + Number of split levels in inversible conv1x1 operation. Defaults to 4. + num_squeeze (int): + Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor + 'num_squeeze'. Defaults to 2. + sigmoid_scale (bool): + enable/disable sigmoid scaling in decoder. Defaults to False. + mean_only (bool): + If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true. + encoder_type (str): + Encoder module type. Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]` + Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper. + encoder_params (dict): + Encoder module parameters. Defaults to None. + d_vector_dim (int): + Channels of external speaker embedding vectors. Defaults to 0. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -38,12 +74,14 @@ class GlowTTSConfig(BaseTTSConfig): Path to the wav file used for changing the style of the speech. Defaults to None. 
inference_noise_scale (float): Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0. + length_scale (float): + Multiply the predicted durations with this value to change the speech speed. Defaults to 1. use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -62,6 +100,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params + num_chars: int = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -73,9 +112,35 @@ class GlowTTSConfig(BaseTTSConfig): } ) use_encoder_prenet: bool = True - hidden_channels_encoder: int = 192 - hidden_channels_decoder: int = 192 - hidden_channels_duration_predictor: int = 256 + hidden_channels_enc: int = 192 + hidden_channels_dec: int = 192 + hidden_channels_dp: int = 256 + dropout_p_dp: float = 0.1 + dropout_p_dec: float = 0.05 + mean_only: bool = True + out_channels: int = 80 + num_flow_blocks_dec: int = 12 + inference_noise_scale: float = 0.33 + kernel_size_dec: int = 5 + dilation_rate: int = 1 + num_block_layers: int = 4 + num_speakers: int = 0 + c_in_channels: int = 0 + num_splits: int = 4 + num_squeeze: int = 2 + sigmoid_scale: bool = False + encoder_type: str = "rel_pos_transformer" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 3, + "dropout_p": 0.1, + "num_layers": 6, + "num_heads": 2, + "hidden_channels_ffn": 768, + "input_length": None, + } + ) + d_vector_dim: int = 0 # training params data_dep_init_steps: int = 10 @@ -83,18 +148,20 @@ class GlowTTSConfig(BaseTTSConfig): # inference params style_wav_for_test: str = None inference_noise_scale: float = 0.0 + length_scale: float = 1.0 # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False - # optimizer params - noam_schedule: bool = True - warmup_steps: int = 4000 + # optimizer parameters + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) grad_clip: float = 5.0 lr: float = 1e-3 - wd: float = 0.000001 # overrides min_seq_len: int = 3 diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index a501a880..4b916a17 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass, field from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @@ -133,6 +133,18 @@ class BaseTTSConfig(BaseTrainingConfig): datasets (List[BaseDatasetConfig]): List of datasets used for training. 
If multiple datasets are provided, they are merged and used together for training. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to ``. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to ``. + lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. + test_sentences (List[str]): + List of sentences to be used at testing. Defaults to '[]' """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -141,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig): use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False - text_cleaner: str = MISSING + text_cleaner: str = None enable_eos_bos_chars: bool = False test_sentences_file: str = "" phoneme_cache_path: str = None @@ -158,3 +170,15 @@ class BaseTTSConfig(BaseTrainingConfig): add_blank: bool = False # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) + # optimizer + optimizer: str = None + optimizer_params: dict = None + # scheduler + lr_scheduler: str = "" + lr_scheduler_params: dict = field(default_factory=lambda: {}) + # testing + test_sentences: List[str] = field(default_factory=lambda: []) + # multi-speaker + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index 1b8f0c82..b2641ab5 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,6 +1,8 @@ from dataclasses import dataclass, field +from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs @dataclass @@ -15,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig): Args: model (str): Model name used for selecting the right model at initialization. Defaults to `speedy_speech`. - positional_encoding (bool): - enable / disable positional encoding applied to the encoder output. Defaults to True. - hidden_channels (int): - Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder - parameters. Defaults to 128. - encoder_type (str): - Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `residual_conv_bn`. - encoder_params (dict): - Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}` - decoder_type (str): - Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `residual_conv_bn`. - decoder_params (dict): - Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}` - hidden_channels_encoder (int): - Number of base hidden channels used by the encoder network. 
It defines the input and the output channel sizes, - and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): - Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. - hidden_channels_duration_predictor (int): - Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + model_args (Coqpit): + Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -46,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -72,37 +52,19 @@ class SpeedySpeechConfig(BaseTTSConfig): model: str = "speedy_speech" # model specific params - positional_encoding: bool = True - hidden_channels: int = 128 - encoder_type: str = "residual_conv_bn" - encoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], - "num_conv_blocks": 2, - "num_res_blocks": 13, - } - ) - decoder_type: str = "residual_conv_bn" - decoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], - "num_conv_blocks": 2, - "num_res_blocks": 17, - } - ) + model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs) # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) + lr_scheduler: str = None + lr_scheduler_params: dict = None lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 # loss params @@ -114,3 +76,14 @@ class SpeedySpeechConfig(BaseTTSConfig): min_seq_len: int = 13 max_seq_len: int = 200 r: int = 1 # DO NOT CHANGE + + # testing + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. 
It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py index ea66fae8..b622e640 100644 --- a/TTS/tts/configs/tacotron2_config.py +++ b/TTS/tts/configs/tacotron2_config.py @@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig): >>> from TTS.tts.configs import Tacotron2Config >>> config = Tacotron2Config() - Args: - model (str): - Model name used to select the right model class to initilize. Defaults to `Tacotron2`. - use_gst (bool): - enable / disable the use of Global Style Token modules. Defaults to False. - gst (GSTConfig): - Instance of `GSTConfig` class. - gst_style_input (str): - Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and - this is not defined, the model uses a zero vector as an input. Defaults to None. - r (int): - Number of output frames that the decoder computed per iteration. Larger values makes training and inference - faster but reduces the quality of the output frames. This needs to be tuned considering your own needs. - Defaults to 1. - gradual_trainin (List[List]): - Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is - the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. - If sets None, no gradual training is used. Defaults to None. - memory_size (int): - Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. - Defaults to -1. - prenet_type (str): - `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the - Prenet. Defaults to `original`. - prenet_dropout (bool): - enables / disables the use of dropout in the Prenet. Defaults to True. - prenet_dropout_at_inference (bool): - enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. - stopnet (bool): - enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. - stopnet_pos_weight (float): - Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with - datasets with longer sentences. Defaults to 10. - separate_stopnet (bool): - Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. - attention_type (str): - attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attention_heads (int): - Number of attention heads for GMM attention. Defaults to 5. - windowing (bool): - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - use_forward_attn (bool): - It is only valid if ```attn_type``` is ```original```. Defaults to False. - forward_attn_mask (bool): - enable/disable extra masking over forward attention. It is useful at inference to prevent - possible attention failures. Defaults to False. - transition_agent (bool): - enable/disable transition agent in forward attention. Defaults to False. - location_attn (bool): - enable/disable location sensitive attention as in the original Tacotron2 paper. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - bidirectional_decoder (bool): - enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool): - enable/disable double decoder consistency. Defaults to False. 
- ddc_r (int): - reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this - as a multiple of the `r` value. Defaults to 6. - use_speaker_embedding (bool): - enable / disable using speaker embeddings for multi-speaker models. If set True, the model is - in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): - enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. - lr (float): - Initial learning rate. Defaults to `1e-4`. - wd (float): - Weight decay coefficient. Defaults to `1e-6`. - grad_clip (float): - Gradient clipping threshold. Defaults to `5`. - seq_len_notm (bool): - enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample - is divided by the sequence length. Defaults to False. - loss_masking (bool): - enable / disable masking the paddings of the samples in loss computation. Defaults to True. - decoder_loss_alpha (float): - Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_loss_alpha (float): - Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_diff_spec_alpha (float): - Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_diff_spec_alpha (float): - Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_ssim_alpha (float): - Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_ssim_alpha (float): - Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - ga_alpha (float): - Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss - function. Defaults to 5. + Check `TacotronConfig` for argument descriptions. """ model: str = "tacotron2" + out_channels: int = 80 + encoder_in_features: int = 512 + decoder_in_features: int = 512 diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index a567cd88..89fb8d81 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig @@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig): gst_style_input (str): Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and this is not defined, the model uses a zero vector as an input. Defaults to None. + num_chars (int): + Number of characters used by the model. It must be defined before initializing the model. Defaults to None. 
+ num_speakers (int): + Number of speakers for multi-speaker models. Defaults to 1. r (int): Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in @@ -46,6 +50,14 @@ class TacotronConfig(BaseTTSConfig): stopnet_pos_weight (float): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. + max_decoder_steps (int): + Max number of steps allowed for the decoder. Defaults to 50. + encoder_in_features (int): + Channels of encoder input and character embedding tensors. Defaults to 256. + decoder_in_features (int): + Channels of decoder input and encoder output tensors. Defaults to 256. + out_channels (int): + Channels of the final model output. It must match the spectragram size. Defaults to 80. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -74,14 +86,20 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. + optimizer (str): + Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. + Defaults to `RAdam`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler (str): + Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or + `TTS.utils.training`. Defaults to `NoamLR`. + lr_scheduler_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. lr (float): Initial learning rate. Defaults to `1e-4`. wd (float): @@ -103,6 +121,7 @@ class TacotronConfig(BaseTTSConfig): Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. Defaults to 0.25 decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. 
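# A hedged sketch of the renamed training options in this hunk: the Noam-specific
# fields give way to generic optimizer/scheduler settings, and the external
# speaker-embedding options move to d-vector naming. It assumes TacotronConfig is
# importable from TTS.tts.configs like Tacotron2Config; values mirror the defaults
# added here except where marked illustrative.
from TTS.tts.configs import TacotronConfig

config = TacotronConfig(
    optimizer="RAdam",
    optimizer_params={"betas": [0.9, 0.998], "weight_decay": 1e-6},
    lr_scheduler="NoamLR",
    lr_scheduler_params={"warmup_steps": 4000},
    use_d_vector_file=False,   # was use_external_speaker_embedding_file
    d_vector_file=None,        # was external_speaker_embedding_file (illustrative value)
)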
Defaults to 0.25 decoder_ssim_alpha (float): @@ -117,10 +136,14 @@ class TacotronConfig(BaseTTSConfig): """ model: str = "tacotron" + # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs()) use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None + # model specific params + num_speakers: int = 1 + num_chars: int = 0 r: int = 2 gradual_training: List[List[int]] = None memory_size: int = -1 @@ -130,11 +153,17 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 + max_decoder_steps: int = 500 + encoder_in_features: int = 256 + decoder_in_features: int = 256 + decoder_output_dim: int = 80 + out_channels: int = 513 # attention layers attention_type: str = "original" attention_heads: int = None attention_norm: str = "sigmoid" + attention_win: bool = False windowing: bool = False use_forward_attn: bool = False forward_attn_mask: bool = False @@ -148,14 +177,17 @@ class TacotronConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + speaker_embedding_dim: int = 512 + use_d_vector_file: bool = False + d_vector_file: str = False + d_vector_dim: int = None # optimizer parameters - noam_schedule: bool = False - warmup_steps: int = 4000 + optimizer: str = "RAdam" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6}) + lr_scheduler: str = "NoamLR" + lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000}) lr: float = 1e-4 - wd: float = 1e-6 grad_clip: float = 5.0 seq_len_norm: bool = False loss_masking: bool = True @@ -169,8 +201,25 @@ class TacotronConfig(BaseTTSConfig): postnet_ssim_alpha: float = 0.25 ga_alpha: float = 5.0 + # testing + test_sentences: List[str] = field( + default_factory=lambda: [ + "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", + "Be a voice, not an echo.", + "I'm sorry Dave. I'm afraid I can't do that.", + "This cake is great. It's so delicious and moist.", + "Prior to November 22, 1963.", + ] + ) + def check_values(self): if self.gradual_training: assert ( self.gradual_training[0][1] == self.r ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. 
{self.gradual_training[0][1]} vs {self.r}" + if self.model == "tacotron" and self.audio is not None: + assert self.out_channels == ( + self.audio.fft_size // 2 + 1 + ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + if self.model == "tacotron2" and self.audio is not None: + assert self.out_channels == self.audio.num_mels diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 4ca93232..0fc23231 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -2,6 +2,7 @@ import collections import os import random from multiprocessing import Pool +from typing import Dict, List import numpy as np import torch @@ -10,49 +11,82 @@ from torch.utils.data import Dataset from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor from TTS.tts.utils.text import pad_with_eos_bos, phoneme_to_sequence, text_to_sequence +from TTS.utils.audio import AudioProcessor -class MyDataset(Dataset): +class TTSDataset(Dataset): def __init__( self, - outputs_per_step, - text_cleaner, - compute_linear_spec, - ap, - meta_data, - tp=None, - add_blank=False, - batch_group_size=0, - min_seq_len=0, - max_seq_len=float("inf"), - use_phonemes=False, - phoneme_cache_path=None, - phoneme_language="en-us", - enable_eos_bos=False, - speaker_mapping=None, - use_noise_augment=False, - verbose=False, + outputs_per_step: int, + text_cleaner: list, + compute_linear_spec: bool, + ap: AudioProcessor, + meta_data: List[List], + characters: Dict = None, + add_blank: bool = False, + batch_group_size: int = 0, + min_seq_len: int = 0, + max_seq_len: int = float("inf"), + use_phonemes: bool = False, + phoneme_cache_path: str = None, + phoneme_language: str = "en-us", + enable_eos_bos: bool = False, + speaker_id_mapping: Dict = None, + d_vector_mapping: Dict = None, + use_noise_augment: bool = False, + verbose: bool = False, ): - """ + """Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs. + + If you need something different, you can either override or create a new class as the dataset is + initialized by the model. + Args: - outputs_per_step (int): number of time frames predicted per step. - text_cleaner (str): text cleaner used for the dataset. + outputs_per_step (int): Number of time frames predicted per step. + + text_cleaner (list): List of text cleaners to clean the input text before converting to sequence IDs. + compute_linear_spec (bool): compute linear spectrogram if True. - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - tp (dict): dict of custom text characters used for converting texts to sequences. - batch_group_size (int): (0) range of batch randomization after sorting - sequences by length. - min_seq_len (int): (0) minimum sequence length to be processed - by the loader. - max_seq_len (int): (float("inf")) maximum sequence length. - use_phonemes (bool): (true) if true, text converted to phonemes. - phoneme_cache_path (str): path to cache phoneme features. - phoneme_language (str): one the languages from - https://github.com/bootphon/phonemizer#languages - enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. - use_noise_augment (bool): enable adding random noise to wav for augmentation. - verbose (bool): print diagnostic information. + + ap (TTS.tts.utils.AudioProcessor): Audio processor object. + + meta_data (list): List of dataset instances. 
+ + characters (dict): `dict` of custom text characters used for converting texts to sequences. + + add_blank (bool): Add a special `blank` character after every other character. It helps some + models achieve better results. Defaults to false. + + batch_group_size (int): Range of batch randomization after sorting + sequences by length. It shuffles each batch with bucketing to gather similar lenght sequences in a + batch. Set 0 to disable. Defaults to 0. + + min_seq_len (int): Minimum input sequence length to be processed + by the loader. Filter out input sequences that are shorter than this. Some models have a + minimum input length due to its architecture. Defaults to 0. + + max_seq_len (int): Maximum input sequence length. Filter out input sequences that are longer than this. + It helps for controlling the VRAM usage against long input sequences. Especially models with + RNN layers are sensitive to input length. Defaults to `Inf`. + + use_phonemes (bool): If true, input text converted to phonemes. Defaults to false. + + phoneme_cache_path (str): Path to cache phoneme features. It writes computed phonemes to files to use in + the coming iterations. Defaults to None. + + phoneme_language (str): One the languages from supported by the phonemizer interface. Defaults to `en-us`. + + enable_eos_bos (bool): Enable the `end of sentence` and the `beginning of sentences characters`. Defaults + to False. + + speaker_id_mapping (dict): Mapping of speaker names to IDs used to compute embedding vectors by the + embedding layer. Defaults to None. + + d_vector_mapping (dict): Mapping of wav files to computed d-vectors. Defaults to None. + + use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False. + + verbose (bool): Print diagnostic information. Defaults to false. """ super().__init__() self.batch_group_size = batch_group_size @@ -64,13 +98,14 @@ class MyDataset(Dataset): self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap - self.tp = tp + self.characters = characters self.add_blank = add_blank self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos - self.speaker_mapping = speaker_mapping + self.speaker_id_mapping = speaker_id_mapping + self.d_vector_mapping = d_vector_mapping self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False @@ -93,13 +128,13 @@ class MyDataset(Dataset): return data @staticmethod - def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, tp, add_blank): + def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, characters, add_blank): """generate a phoneme sequence from text. since the usage is for subsequent caching, we never add bos and eos chars here. 
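# A hedged construction sketch for the renamed TTSDataset (previously MyDataset).
# `ap` and `train_samples` are assumed to be an AudioProcessor instance and the
# sample list returned by load_meta_data() elsewhere; the cleaner name and length
# limits are illustrative, not defaults.
from TTS.tts.datasets.TTSDataset import TTSDataset

dataset = TTSDataset(
    outputs_per_step=1,
    text_cleaner="phoneme_cleaners",
    compute_linear_spec=False,       # True only for models that also need linear spectrograms
    ap=ap,                           # TTS.utils.audio.AudioProcessor (assumed to exist)
    meta_data=train_samples,         # list returned by load_meta_data() (assumed to exist)
    characters=None,                 # None -> use the default symbol set
    min_seq_len=2,
    max_seq_len=300,
    use_phonemes=False,
    speaker_id_mapping=None,         # dict of speaker name -> integer ID for embedding layers
    d_vector_mapping=None,           # dict of wav file -> pre-computed d-vector
)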
Instead we add those dynamically later; based on the config option.""" phonemes = phoneme_to_sequence( - text, [cleaners], language=language, enable_eos_bos=False, tp=tp, add_blank=add_blank + text, [cleaners], language=language, enable_eos_bos=False, tp=characters, add_blank=add_blank ) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) @@ -107,7 +142,7 @@ class MyDataset(Dataset): @staticmethod def _load_or_generate_phoneme_sequence( - wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, tp, add_blank + wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, characters, add_blank ): file_name = os.path.splitext(os.path.basename(wav_file))[0] @@ -117,16 +152,16 @@ class MyDataset(Dataset): try: phonemes = np.load(cache_path) except FileNotFoundError: - phonemes = MyDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( + text, cache_path, cleaners, language, characters, add_blank ) except (ValueError, IOError): print(" [!] failed loading phonemes for {}. " "Recomputing.".format(wav_file)) - phonemes = MyDataset._generate_and_cache_phoneme_sequence( - text, cache_path, cleaners, language, tp, add_blank + phonemes = TTSDataset._generate_and_cache_phoneme_sequence( + text, cache_path, cleaners, language, characters, add_blank ) if enable_eos_bos: - phonemes = pad_with_eos_bos(phonemes, tp=tp) + phonemes = pad_with_eos_bos(phonemes, tp=characters) phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes @@ -154,13 +189,14 @@ class MyDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ) else: text = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) assert text.size > 0, self.items[idx][1] @@ -190,7 +226,7 @@ class MyDataset(Dataset): item = args[0] func_args = args[1] text, wav_file, *_ = item - phonemes = MyDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) + phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args) return phonemes def compute_input_seq(self, num_workers=0): @@ -202,7 +238,8 @@ class MyDataset(Dataset): for idx, item in enumerate(tqdm.tqdm(self.items)): text, *_ = item sequence = np.asarray( - text_to_sequence(text, [self.cleaners], tp=self.tp, add_blank=self.add_blank), dtype=np.int32 + text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), + dtype=np.int32, ) self.items[idx][0] = sequence @@ -212,7 +249,7 @@ class MyDataset(Dataset): self.enable_eos_bos, self.cleaners, self.phoneme_language, - self.tp, + self.characters, self.add_blank, ] if self.verbose: @@ -225,7 +262,7 @@ class MyDataset(Dataset): with Pool(num_workers) as p: phonemes = list( tqdm.tqdm( - p.imap(MyDataset._phoneme_worker, [[item, func_args] for item in self.items]), + p.imap(TTSDataset._phoneme_worker, [[item, func_args] for item in self.items]), total=len(self.items), ) ) @@ -282,7 +319,7 @@ class MyDataset(Dataset): """ # Puts each data field into a tensor with outer dimension batch size - if isinstance(batch[0], collections.Mapping): + if isinstance(batch[0], collections.abc.Mapping): text_lenghts = np.array([len(d["text"]) for d in batch]) @@ -293,13 +330,18 @@ class MyDataset(Dataset): item_idxs = 
[batch[idx]["item_idx"] for idx in ids_sorted_decreasing] text = [batch[idx]["text"] for idx in ids_sorted_decreasing] - speaker_name = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] - # get speaker embeddings - if self.speaker_mapping is not None: + speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] + # get pre-computed d-vectors + if self.d_vector_mapping is not None: wav_files_names = [batch[idx]["wav_file_name"] for idx in ids_sorted_decreasing] - speaker_embedding = [self.speaker_mapping[w]["embedding"] for w in wav_files_names] + d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] else: - speaker_embedding = None + d_vectors = None + # get numerical speaker ids from speaker names + if self.speaker_id_mapping: + speaker_ids = [self.speaker_id_mapping[sn] for sn in speaker_names] + else: + speaker_ids = None # compute features mel = [self.ap.melspectrogram(w).astype("float32") for w in wav] @@ -327,8 +369,11 @@ class MyDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - if speaker_embedding is not None: - speaker_embedding = torch.FloatTensor(speaker_embedding) + if d_vectors is not None: + d_vectors = torch.FloatTensor(d_vectors) + + if speaker_ids is not None: + speaker_ids = torch.LongTensor(speaker_ids) # compute linear spectrogram if self.compute_linear_spec: @@ -355,13 +400,14 @@ class MyDataset(Dataset): return ( text, text_lenghts, - speaker_name, + speaker_names, linear, mel, mel_lengths, stop_targets, item_idxs, - speaker_embedding, + d_vectors, + speaker_ids, attns, ) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index e69de29b..736d6ed4 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -0,0 +1,84 @@ +import sys +from collections import Counter +from pathlib import Path + +import numpy as np + +from TTS.tts.datasets.formatters import * +from TTS.tts.datasets.TTSDataset import TTSDataset + + +def split_dataset(items): + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = min(500, int(len(items) * 0.01)) + assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." 
+ np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + while len(items_eval) < eval_split_size: + item_idx = np.random.randint(0, len(items)) + speaker_to_be_removed = items[item_idx][-1] + if speaker_counter[speaker_to_be_removed] > 1: + items_eval.append(items[item_idx]) + speaker_counter[speaker_to_be_removed] -= 1 + del items[item_idx] + return items_eval, items + return items[:eval_split_size], items[eval_split_size:] + + +def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False): + meta_data_train_all = [] + meta_data_eval_all = [] if eval_split else None + for dataset in datasets: + name = dataset["name"] + root_path = dataset["path"] + meta_file_train = dataset["meta_file_train"] + meta_file_val = dataset["meta_file_val"] + # setup the right data processor + preprocessor = _get_preprocessor_by_name(name) + # load train set + meta_data_train = preprocessor(root_path, meta_file_train) + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") + # load evaluation split if set + if eval_split: + if meta_file_val: + meta_data_eval = preprocessor(root_path, meta_file_val) + meta_data_eval_all += meta_data_eval + elif not ignore_generated_eval: + meta_data_eval, meta_data_train = split_dataset(meta_data_train) + meta_data_eval_all += meta_data_eval + + meta_data_train_all += meta_data_train + # load attention masks for duration predictor training + if dataset.meta_file_attn_mask: + meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) + for idx, ins in enumerate(meta_data_train_all): + attn_file = meta_data[ins[1]].strip() + meta_data_train_all[idx].append(attn_file) + if meta_data_eval_all: + for idx, ins in enumerate(meta_data_eval_all): + attn_file = meta_data[ins[1]].strip() + meta_data_eval_all[idx].append(attn_file) + return meta_data_train_all, meta_data_eval_all + + +def load_attention_mask_meta_data(metafile_path): + """Load meta data file created by compute_attention_masks.py""" + with open(metafile_path, "r") as f: + lines = f.readlines() + + meta_data = [] + for line in lines: + wav_file, attn_file = line.split("|") + meta_data.append([wav_file, attn_file]) + return meta_data + + +def _get_preprocessor_by_name(name): + """Returns the respective preprocessing function.""" + thismodule = sys.modules[__name__] + return getattr(thismodule, name.lower()) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/formatters.py similarity index 81% rename from TTS/tts/datasets/preprocess.py rename to TTS/tts/datasets/formatters.py index cff7907e..ef5299cb 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/formatters.py @@ -1,96 +1,12 @@ import os import re -import sys import xml.etree.ElementTree as ET -from collections import Counter from glob import glob from pathlib import Path from typing import List -import numpy as np from tqdm import tqdm -#################### -# UTILITIES -#################### - - -def split_dataset(items): - speakers = [item[-1] for item in items] - is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = min(500, int(len(items) * 0.01)) - assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." 
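# The dataset helpers move from TTS.tts.datasets.preprocess into the package
# __init__ (with the per-dataset formatters split out). A hedged sketch of the
# eval-split rule implemented above: 1% of the items, capped at 500, is held out.
# The texts and wav paths below are placeholders.
from TTS.tts.datasets import split_dataset

# items as produced by a formatter: [text, wav_path, speaker_name]
items = [[f"sentence {i}", f"/data/wavs/{i}.wav", "ljspeech"] for i in range(1000)]
eval_items, train_items = split_dataset(items)
print(len(eval_items), len(train_items))   # expected: 10 990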
- np.random.seed(0) - np.random.shuffle(items) - if is_multi_speaker: - items_eval = [] - speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) - while len(items_eval) < eval_split_size: - item_idx = np.random.randint(0, len(items)) - speaker_to_be_removed = items[item_idx][-1] - if speaker_counter[speaker_to_be_removed] > 1: - items_eval.append(items[item_idx]) - speaker_counter[speaker_to_be_removed] -= 1 - del items[item_idx] - return items_eval, items - return items[:eval_split_size], items[eval_split_size:] - - -def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False): - meta_data_train_all = [] - meta_data_eval_all = [] if eval_split else None - for dataset in datasets: - name = dataset["name"] - root_path = dataset["path"] - meta_file_train = dataset["meta_file_train"] - meta_file_val = dataset["meta_file_val"] - # setup the right data processor - preprocessor = get_preprocessor_by_name(name) - # load train set - meta_data_train = preprocessor(root_path, meta_file_train) - print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") - # load evaluation split if set - if eval_split: - if meta_file_val: - meta_data_eval = preprocessor(root_path, meta_file_val) - meta_data_eval_all += meta_data_eval - elif not ignore_generated_eval: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - meta_data_eval_all += meta_data_eval - - meta_data_train_all += meta_data_train - # load attention masks for duration predictor training - if dataset.meta_file_attn_mask: - meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) - for idx, ins in enumerate(meta_data_train_all): - attn_file = meta_data[ins[1]].strip() - meta_data_train_all[idx].append(attn_file) - if meta_data_eval_all: - for idx, ins in enumerate(meta_data_eval_all): - attn_file = meta_data[ins[1]].strip() - meta_data_eval_all[idx].append(attn_file) - return meta_data_train_all, meta_data_eval_all - - -def load_attention_mask_meta_data(metafile_path): - """Load meta data file created by compute_attention_masks.py""" - with open(metafile_path, "r") as f: - lines = f.readlines() - - meta_data = [] - for line in lines: - wav_file, attn_file = line.split("|") - meta_data.append([wav_file, attn_file]) - return meta_data - - -def get_preprocessor_by_name(name): - """Returns the respective preprocessing function.""" - thismodule = sys.modules[__name__] - return getattr(thismodule, name.lower()) - - ######################## # DATASETS ######################## @@ -191,6 +107,20 @@ def ljspeech(root_path, meta_file): return items +def ljspeech_test(root_path, meta_file): + """Normalizes the LJSpeech meta data file for TTS testing + https://keithito.com/LJ-Speech-Dataset/""" + txt_file = os.path.join(root_path, meta_file) + items = [] + with open(txt_file, "r", encoding="utf-8") as ttf: + for idx, line in enumerate(ttf): + cols = line.split("|") + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + text = cols[1] + items.append([text, wav_file, f"ljspeech-{idx}"]) + return items + + def sam_accenture(root_path, meta_file): """Normalizes the sam-accenture meta data file to TTS format https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files""" diff --git a/TTS/tts/layers/__init__.py b/TTS/tts/layers/__init__.py index e69de29b..78f56a5d 100644 --- a/TTS/tts/layers/__init__.py +++ b/TTS/tts/layers/__init__.py @@ -0,0 +1,15 @@ +from TTS.tts.layers.losses import * + + +def setup_loss(config): + if config.model.lower() in 
["tacotron", "tacotron2"]: + model = TacotronLoss(config) + elif config.model.lower() == "glow_tts": + model = GlowTTSLoss() + elif config.model.lower() == "speedy_speech": + model = SpeedySpeechLoss(config) + elif config.model.lower() == "align_tts": + model = AlignTTSLoss(config) + else: + raise ValueError(f" [!] loss for model {config.model.lower()} cannot be found.") + return model diff --git a/TTS/tts/layers/glow_tts/decoder.py b/TTS/tts/layers/glow_tts/decoder.py index 7b3f0ed1..f57c3731 100644 --- a/TTS/tts/layers/glow_tts/decoder.py +++ b/TTS/tts/layers/glow_tts/decoder.py @@ -12,7 +12,8 @@ def squeeze(x, x_mask=None, num_sqz=2): Note: each 's' is a n-dimensional vector. - [s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]""" + ``[s1,s2,s3,s4,s5,s6] --> [[s1, s3, s5], [s2, s4, s6]]`` + """ b, c, t = x.size() t = (t // num_sqz) * num_sqz @@ -32,7 +33,8 @@ def unsqueeze(x, x_mask=None, num_sqz=2): Note: each 's' is a n-dimensional vector. - [[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5], [s2, s4, s6]]""" + ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5], [s2, s4, s6]]`` + """ b, c, t = x.size() x_unsqz = x.view(b, num_sqz, c // num_sqz, t) @@ -47,7 +49,10 @@ def unsqueeze(x, x_mask=None, num_sqz=2): class Decoder(nn.Module): """Stack of Glow Decoder Modules. - Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze + + :: + + Squeeze -> ActNorm -> InvertibleConv1x1 -> AffineCoupling -> Unsqueeze Args: in_channels (int): channels of input tensor. @@ -106,6 +111,12 @@ class Decoder(nn.Module): ) def forward(self, x, x_mask, g=None, reverse=False): + """ + Shapes: + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1 ,T]` + - g: :math:`[B, C]` + """ if not reverse: flows = self.flows logdet_tot = 0 diff --git a/TTS/tts/layers/glow_tts/duration_predictor.py b/TTS/tts/layers/glow_tts/duration_predictor.py index 51d1066a..e35aeb68 100644 --- a/TTS/tts/layers/glow_tts/duration_predictor.py +++ b/TTS/tts/layers/glow_tts/duration_predictor.py @@ -6,13 +6,16 @@ from ..generic.normalization import LayerNorm class DurationPredictor(nn.Module): """Glow-TTS duration prediction model. - [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs - Args: - in_channels ([type]): [description] - hidden_channels ([type]): [description] - kernel_size ([type]): [description] - dropout_p ([type]): [description] + :: + + [2 x (conv1d_kxk -> relu -> layer_norm -> dropout)] -> conv1d_1x1 -> durs + + Args: + in_channels (int): Number of channels of the input tensor. + hidden_channels (int): Number of hidden channels of the network. + kernel_size (int): Kernel size for the conv layers. + dropout_p (float): Dropout rate used after each conv layer. 
""" def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p): @@ -34,11 +37,8 @@ class DurationPredictor(nn.Module): def forward(self, x, x_mask): """ Shapes: - x: [B, C, T] - x_mask: [B, 1, T] - - Returns: - [type]: [description] + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ x = self.conv_1(x * x_mask) x = torch.relu(x) diff --git a/TTS/tts/layers/glow_tts/encoder.py b/TTS/tts/layers/glow_tts/encoder.py index 48bb3008..f3eb4655 100644 --- a/TTS/tts/layers/glow_tts/encoder.py +++ b/TTS/tts/layers/glow_tts/encoder.py @@ -9,19 +9,22 @@ from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlo from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.glow_tts.glow import ResidualConv1dLayerNormBlock from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask class Encoder(nn.Module): """Glow-TTS encoder module. - embedding -> -> encoder_module -> --> proj_mean - | - |-> proj_var - | - |-> concat -> duration_predictor - ↑ - speaker_embed + :: + + embedding -> -> encoder_module -> --> proj_mean + | + |-> proj_var + | + |-> concat -> duration_predictor + ↑ + speaker_embed + Args: num_chars (int): number of characters. out_channels (int): number of output channels. @@ -36,7 +39,8 @@ class Encoder(nn.Module): Shapes: - input: (B, T, C) - Notes: + :: + suggested encoder params... for encoder_type == 'rel_pos_transformer' @@ -139,9 +143,9 @@ class Encoder(nn.Module): def forward(self, x, x_lengths, g=None): """ Shapes: - x: [B, C, T] - x_lengths: [B] - g (optional): [B, 1, T] + - x: :math:`[B, C, T]` + - x_lengths: :math:`[B]` + - g (optional): :math:`[B, 1, T]` """ # embedding layer # [B ,T, D] diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 18c491e3..33036537 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import torch from torch import nn from torch.nn import functional as F @@ -8,21 +10,24 @@ from ..generic.normalization import LayerNorm class ResidualConv1dLayerNormBlock(nn.Module): + """Conv1d with Layer Normalization and residual connection as in GlowTTS paper. + https://arxiv.org/pdf/1811.00002.pdf + + :: + + x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o + |---------------> conv1d_1x1 -----------------------| + + Args: + in_channels (int): number of input tensor channels. + hidden_channels (int): number of inner layer channels. + out_channels (int): number of output tensor channels. + kernel_size (int): kernel size of conv1d filter. + num_layers (int): number of blocks. + dropout_p (float): dropout rate for each block. + """ + def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, num_layers, dropout_p): - """Conv1d with Layer Normalization and residual connection as in GlowTTS paper. - https://arxiv.org/pdf/1811.00002.pdf - - x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o - |---------------> conv1d_1x1 -----------------------| - - Args: - in_channels (int): number of input tensor channels. - hidden_channels (int): number of inner layer channels. - out_channels (int): number of output tensor channels. - kernel_size (int): kernel size of conv1d filter. - num_layers (int): number of blocks. - dropout_p (float): dropout rate for each block. 
- """ super().__init__() self.in_channels = in_channels self.hidden_channels = hidden_channels @@ -49,6 +54,11 @@ class ResidualConv1dLayerNormBlock(nn.Module): self.proj.bias.data.zero_() def forward(self, x, x_mask): + """ + Shapes: + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` + """ x_res = x for i in range(self.num_layers): x = self.conv_layers[i](x * x_mask) @@ -81,7 +91,11 @@ class InvConvNear(nn.Module): self.no_jacobian = no_jacobian self.weight_inv = None - w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + if LooseVersion(torch.__version__) < LooseVersion("1.9"): + w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] + else: + w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] + if torch.det(w_init) < 0: w_init[:, 0] = -1 * w_init[:, 0] self.weight = nn.Parameter(w_init) @@ -89,8 +103,8 @@ class InvConvNear(nn.Module): def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument """ Shapes: - x: B x C x T - x_mask: B x 1 x T + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ b, c, t = x.size() @@ -133,10 +147,12 @@ class CouplingBlock(nn.Module): """Glow Affine Coupling block as in GlowTTS paper. https://arxiv.org/pdf/1811.00002.pdf - x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o - '-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^ + :: - Args: + x --> x0 -> conv1d -> wavenet -> conv1d --> t, s -> concat(s*x1 + t, x0) -> o + '-> x1 - - - - - - - - - - - - - - - - - - - - - - - - - ^ + + Args: in_channels (int): number of input tensor channels. hidden_channels (int): number of hidden channels. kernel_size (int): WaveNet filter kernel size. @@ -146,8 +162,8 @@ class CouplingBlock(nn.Module): dropout_p (int): wavenet dropout rate. sigmoid_scale (bool): enable/disable sigmoid scaling for output scale. - Note: - It does not use conditional inputs differently from WaveGlow. + Note: + It does not use the conditional inputs differently from WaveGlow. """ def __init__( @@ -187,9 +203,9 @@ class CouplingBlock(nn.Module): def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument """ Shapes: - x: B x C x T - x_mask: B x 1 x T - g: B x C x 1 + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` + - g: :math:`[B, C, 1]` """ if x_mask is None: x_mask = 1 diff --git a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py index 7be124f4..5cbfd8fc 100644 --- a/TTS/tts/layers/glow_tts/monotonic_align/__init__.py +++ b/TTS/tts/layers/glow_tts/monotonic_align/__init__.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch.nn import functional as F -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask try: # TODO: fix pypi cython installation problem. diff --git a/TTS/tts/layers/glow_tts/transformer.py b/TTS/tts/layers/glow_tts/transformer.py index 1a67d0ba..92cace78 100644 --- a/TTS/tts/layers/glow_tts/transformer.py +++ b/TTS/tts/layers/glow_tts/transformer.py @@ -17,16 +17,18 @@ class RelativePositionMultiHeadAttention(nn.Module): Note: Example with relative attention window size 2 - input = [a, b, c, d, e] - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)] + + - input = [a, b, c, d, e] + - rel_attn_embeddings = [e(t-2), e(t-1), e(t+1), e(t+2)] So it learns 4 embedding vectors (in total 8) separately for key and value vectors. 
Considering the input c - e(t-2) corresponds to c -> a - e(t-2) corresponds to c -> b - e(t-2) corresponds to c -> d - e(t-2) corresponds to c -> e + + - e(t-2) corresponds to c -> a + - e(t-2) corresponds to c -> b + - e(t-2) corresponds to c -> d + - e(t-2) corresponds to c -> e These embeddings are shared among different time steps. So input a, b, d and e also uses the same embeddings. @@ -106,6 +108,12 @@ class RelativePositionMultiHeadAttention(nn.Module): nn.init.xavier_uniform_(self.conv_v.weight) def forward(self, x, c, attn_mask=None): + """ + Shapes: + - x: :math:`[B, C, T]` + - c: :math:`[B, C, T]` + - attn_mask: :math:`[B, 1, T, T]` + """ q = self.conv_q(x) k = self.conv_k(c) v = self.conv_v(c) @@ -163,9 +171,9 @@ class RelativePositionMultiHeadAttention(nn.Module): re (Tensor): relative value embedding vector. (a_(i,j)^V) Shapes: - p_attn: [B, H, T, V] - re: [H or 1, V, D] - logits: [B, H, T, D] + -p_attn: :math:`[B, H, T, V]` + -re: :math:`[H or 1, V, D]` + -logits: :math:`[B, H, T, D]` """ logits = torch.matmul(p_attn, re.unsqueeze(0)) return logits @@ -178,9 +186,9 @@ class RelativePositionMultiHeadAttention(nn.Module): re (Tensor): relative key embedding vector. (a_(i,j)^K) Shapes: - query: [B, H, T, D] - re: [H or 1, V, D] - logits: [B, H, T, V] + - query: :math:`[B, H, T, D]` + - re: :math:`[H or 1, V, D]` + - logits: :math:`[B, H, T, V]` """ # logits = torch.einsum('bhld, kmd -> bhlm', [query, re.to(query.dtype)]) logits = torch.matmul(query, re.unsqueeze(0).transpose(-2, -1)) @@ -202,10 +210,10 @@ class RelativePositionMultiHeadAttention(nn.Module): @staticmethod def _relative_position_to_absolute_position(x): """Converts tensor from relative to absolute indexing for local attention. - Args: - x: [B, D, length, 2 * length - 1] + Shapes: + x: :math:`[B, C, T, 2 * T - 1]` Returns: - A Tensor of shape [B, D, length, length] + A Tensor of shape :math:`[B, C, T, T]` """ batch, heads, length, _ = x.size() # Pad to shift from relative to absolute indexing. @@ -220,8 +228,9 @@ class RelativePositionMultiHeadAttention(nn.Module): @staticmethod def _absolute_position_to_relative_position(x): """ - x: [B, H, T, T] - ret: [B, H, T, 2*T-1] + Shapes: + - x: :math:`[B, C, T, T]` + - ret: :math:`[B, C, T, 2*T-1]` """ batch, heads, length, _ = x.size() # padd along column @@ -239,7 +248,7 @@ class RelativePositionMultiHeadAttention(nn.Module): Args: length (int): an integer scalar. Returns: - a Tensor with shape [1, 1, length, length] + a Tensor with shape :math:`[1, 1, T, T]` """ # L r = torch.arange(length, dtype=torch.float32) @@ -362,8 +371,8 @@ class RelativePositionTransformer(nn.Module): def forward(self, x, x_mask): """ Shapes: - x: [B, C, T] - x_mask: [B, 1, T] + - x: :math:`[B, C, T]` + - x_mask: :math:`[B, 1, T]` """ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) for i in range(self.num_layers): diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 729a21af..86d34c30 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -5,7 +5,7 @@ import torch from torch import nn from torch.nn import functional -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.ssim import ssim @@ -462,13 +462,12 @@ class MDNLoss(nn.Module): class AlignTTSLoss(nn.Module): """Modified AlignTTS Loss. - Computes following losses + Computes - L1 and SSIM losses from output spectrograms. - Huber loss for duration predictor. - MDNLoss for Mixture of Density Network. 
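# A hedged, self-contained illustration of the aggregation described above: with
# the step-based alpha schedule removed in this hunk, each loss term is weighted
# by a fixed alpha from the config. The numeric values are placeholders, not
# project defaults.
import torch

alphas = {"spec": 1.0, "ssim": 1.0, "dur": 1.0, "mdn": 1.0}
losses = {"spec": torch.tensor(0.50), "ssim": torch.tensor(0.20),
          "dur": torch.tensor(0.10), "mdn": torch.tensor(0.30)}
total = sum(alphas[k] * losses[k] for k in losses)   # mirrors the weighted sum in AlignTTSLoss.forward()
print(round(float(total), 2))                        # 1.1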
- All the losses are aggregated by a weighted sum with the loss alphas. - Alphas can be scheduled based on number of steps. + All loss values are aggregated by a weighted sum of the alpha values. Args: c (dict): TTS model configuration. @@ -487,9 +486,9 @@ class AlignTTSLoss(nn.Module): self.mdn_alpha = c.mdn_alpha def forward( - self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, step, phase + self, logp, decoder_output, decoder_target, decoder_output_lens, dur_output, dur_target, input_lens, phase ): - ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) + # ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha = self.set_alphas(step) spec_loss, ssim_loss, dur_loss, mdn_loss = 0, 0, 0, 0 if phase == 0: mdn_loss = self.mdn_loss(logp, input_lens, decoder_output_lens) @@ -507,36 +506,10 @@ class AlignTTSLoss(nn.Module): spec_loss = self.spec_loss(decoder_output, decoder_target, decoder_output_lens) ssim_loss = self.ssim(decoder_output, decoder_target, decoder_output_lens) dur_loss = self.dur_loss(dur_output.unsqueeze(2), dur_target.unsqueeze(2), input_lens) - loss = spec_loss_alpha * spec_loss + ssim_alpha * ssim_loss + dur_loss_alpha * dur_loss + mdn_alpha * mdn_loss + loss = ( + self.spec_loss_alpha * spec_loss + + self.ssim_alpha * ssim_loss + + self.dur_loss_alpha * dur_loss + + self.mdn_alpha * mdn_loss + ) return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} - - @staticmethod - def _set_alpha(step, alpha_settings): - """Set the loss alpha wrt number of steps. - Return the corresponding value if no schedule is set. - - Example: - Setting a alpha schedule. - if ```alpha_settings``` is ```[[0, 1], [10000, 0.1]]``` then ```return_alpha == 1``` until 10k steps, then set to 0.1. - if ```alpha_settings``` is a constant value then ```return_alpha``` is set to that constant. - - Args: - step (int): number of training steps. - alpha_settings (int or list): constant alpha value or a list defining the schedule as explained above. 
- """ - return_alpha = None - if isinstance(alpha_settings, list): - for key, alpha in alpha_settings: - if key < step: - return_alpha = alpha - elif isinstance(alpha_settings, (float, int)): - return_alpha = alpha_settings - return return_alpha - - def set_alphas(self, step): - """Set the alpha values for all the loss functions""" - ssim_alpha = self._set_alpha(step, self.ssim_alpha) - dur_loss_alpha = self._set_alpha(step, self.dur_loss_alpha) - spec_loss_alpha = self._set_alpha(step, self.spec_loss_alpha) - mdn_alpha = self._set_alpha(step, self.mdn_alpha) - return ssim_alpha, dur_loss_alpha, spec_loss_alpha, mdn_alpha diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index e2784e5d..02154093 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -8,10 +8,10 @@ class GST(nn.Module): See https://arxiv.org/pdf/1803.09017""" - def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None): super().__init__() self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim) - self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim) + self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim) def forward(self, inputs, speaker_embedding=None): enc_out = self.encoder(inputs) @@ -83,13 +83,13 @@ class ReferenceEncoder(nn.Module): class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" - def __init__(self, num_heads, num_style_tokens, embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None): super().__init__() self.query_dim = embedding_dim // 2 - if speaker_embedding_dim: - self.query_dim += speaker_embedding_dim + if d_vector_dim: + self.query_dim += d_vector_dim self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim)) diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index dc38173f..47b5ea7e 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -1,4 +1,6 @@ # coding: utf-8 +# adapted from https://github.com/r9y9/tacotron_pytorch + import torch from torch import nn @@ -266,7 +268,8 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. - speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. + d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 500. 
""" # Pylint gets confused by PyTorch conventions here @@ -289,12 +292,13 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.r_init = r self.r = r self.in_channels = in_channels - self.max_decoder_steps = 500 + self.max_decoder_steps = max_decoder_steps self.use_memory_queue = memory_size > 0 self.memory_size = memory_size if memory_size > 0 else r self.frame_channels = frame_channels diff --git a/TTS/tts/layers/tacotron/tacotron2.py b/TTS/tts/layers/tacotron/tacotron2.py index aeca8953..9c33623e 100644 --- a/TTS/tts/layers/tacotron/tacotron2.py +++ b/TTS/tts/layers/tacotron/tacotron2.py @@ -135,6 +135,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. + max_decoder_steps (int): Maximum number of steps allowed for the decoder. Defaults to 10000. """ # Pylint gets confused by PyTorch conventions here @@ -155,6 +156,7 @@ class Decoder(nn.Module): location_attn, attn_K, separate_stopnet, + max_decoder_steps, ): super().__init__() self.frame_channels = frame_channels @@ -162,7 +164,7 @@ class Decoder(nn.Module): self.r = r self.encoder_embedding_dim = in_channels self.separate_stopnet = separate_stopnet - self.max_decoder_steps = 1000 + self.max_decoder_steps = max_decoder_steps self.stop_threshold = 0.5 # model dimensions @@ -355,7 +357,7 @@ class Decoder(nn.Module): if stop_token > self.stop_threshold and t > inputs.shape[0] // 2: break if len(outputs) == self.max_decoder_steps: - print(" | > Decoder stopped with 'max_decoder_steps") + print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}") break memory = self._update_memory(decoder_output) diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index e69de29b..c6390beb 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -0,0 +1,42 @@ +from TTS.tts.utils.text.symbols import make_symbols, parse_symbols +from TTS.utils.generic_utils import find_module + + +def setup_model(config): + print(" > Using model: {}".format(config.model)) + + MyModel = find_module("TTS.tts.models", config.model.lower()) + # define set of characters used by the model + if config.characters is not None: + # set characters from config + symbols, phonemes = make_symbols(**config.characters.to_dict()) # pylint: disable=redefined-outer-name + else: + from TTS.tts.utils.text.symbols import phonemes, symbols # pylint: disable=import-outside-toplevel + + # use default characters and assign them to config + config.characters = parse_symbols() + num_chars = len(phonemes) if config.use_phonemes else len(symbols) + # consider special `blank` character if `add_blank` is set True + num_chars = num_chars + getattr(config, "add_blank", False) + config.num_chars = num_chars + # compatibility fix + if "model_params" in config: + config.model_params.num_chars = num_chars + if "model_args" in config: + config.model_args.num_chars = num_chars + model = MyModel(config) + return model + + +# TODO; class registery +# def import_models(models_dir, namespace): +# for file in os.listdir(models_dir): +# path = os.path.join(models_dir, file) +# if not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)): +# model_name = file[: file.find(".py")] if file.endswith(".py") else file +# importlib.import_module(namespace + "." 
+ model_name) +# +# +## automatically import any Python files in the models/ directory +# models_dir = os.path.dirname(__file__) +# import_models(models_dir, "TTS.tts.models") diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index e097ac50..879ecae4 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -1,5 +1,9 @@ +from dataclasses import dataclass, field +from typing import Dict, Tuple + import torch import torch.nn as nn +from coqpit import Coqpit from TTS.tts.layers.align_tts.mdn import MDNBlock from TTS.tts.layers.feed_forward.decoder import Decoder @@ -7,32 +11,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class AlignTTS(nn.Module): - """AlignTTS with modified duration predictor. - https://arxiv.org/pdf/2003.01950.pdf - - Encoder -> DurationPredictor -> Decoder - - AlignTTS's Abstract - Targeting at both high efficiency and performance, we propose AlignTTS to predict the - mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a - sequence of characters, and the duration of each character is determined by a duration predictor.Instead of - adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented - to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s - how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean - option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. - - Note: - Original model uses a separate character embedding layer for duration predictor. However, it causes the - duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, - we predict durations based on encoder outputs which has higher level information about input characters. This - enables training without phases as in the original paper. - - Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture - differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. - +@dataclass +class AlignTTSArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters @@ -60,42 +48,102 @@ class AlignTTS(nn.Module): number of channels in speaker embedding vectors. Defaults to 0. 
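# A hedged sketch of building AlignTTS through the new factory added in
# TTS/tts/models/__init__.py; the config import follows the Examples block shown
# further below, and AlignTTSConfig is assumed to expose the AlignTTSArgs dataclass
# as `config.model_args` (which is how AlignTTS.__init__ reads it).
from TTS.tts.configs import AlignTTSConfig
from TTS.tts.models import setup_model

config = AlignTTSConfig()
model = setup_model(config)   # resolves the class from `config.model` and fills num_chars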
""" + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 256 + hidden_channels_dp: int = 256 + encoder_type: str = "fftransformer" + encoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + decoder_type: str = "fftransformer" + decoder_params: dict = field( + default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} + ) + length_scale: float = 1.0 + num_speakers: int = 0 + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 + + +class AlignTTS(BaseTTS): + """AlignTTS with modified duration predictor. + https://arxiv.org/pdf/2003.01950.pdf + + Encoder -> DurationPredictor -> Decoder + + Check :class:`AlignTTSArgs` for the class arguments. + + Paper Abstract: + Targeting at both high efficiency and performance, we propose AlignTTS to predict the + mel-spectrum in parallel. AlignTTS is based on a Feed-Forward Transformer which generates mel-spectrum from a + sequence of characters, and the duration of each character is determined by a duration predictor.Instead of + adopting the attention mechanism in Transformer TTS to align text to mel-spectrum, the alignment loss is presented + to consider all possible alignments in training by use of dynamic programming. Experiments on the LJSpeech dataset s + how that our model achieves not only state-of-the-art performance which outperforms Transformer TTS by 0.03 in mean + option score (MOS), but also a high efficiency which is more than 50 times faster than real-time. + + Note: + Original model uses a separate character embedding layer for duration predictor. However, it causes the + duration predictor to overfit and prevents learning higher level interactions among characters. Therefore, + we predict durations based on encoder outputs which has higher level information about input characters. This + enables training without phases as in the original paper. + + Original model uses Transormers in encoder and decoder layers. However, here you can set the architecture + differently based on your requirements using ```encoder_type``` and ```decoder_type``` parameters. 
+ + Examples: + >>> from TTS.tts.configs import AlignTTSConfig + >>> config = AlignTTSConfig() + >>> model = AlignTTS(config) + + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - num_chars, - out_channels, - hidden_channels=256, - hidden_channels_dp=256, - encoder_type="fftransformer", - encoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - decoder_type="fftransformer", - decoder_params={"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}, - length_scale=1, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + def __init__(self, config: Coqpit): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.pos_encoder = PositionalEncoding(hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels_dp) + self.config = config + self.phase = -1 + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) - self.mod_layer = nn.Conv1d(hidden_channels, hidden_channels, 1) - self.mdn_block = MDNBlock(hidden_channels, 2 * out_channels) + if not self.config.model_args.num_chars: + _, self.config, num_chars = self.get_characters(config) + self.config.model_args.num_chars = num_chars - if num_speakers > 1 and not external_c: - # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) - nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) + self.emb = nn.Embedding(self.config.model_args.num_chars, self.config.model_args.hidden_channels) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + self.embedded_speaker_dim = 0 + self.init_multispeaker(config) + + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + self.embedded_speaker_dim, + ) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels_dp) + + self.mod_layer = nn.Conv1d(config.model_args.hidden_channels, config.model_args.hidden_channels, 1) + + self.mdn_block = MDNBlock(config.model_args.hidden_channels, 2 * config.model_args.out_channels) + + if self.embedded_speaker_dim > 0 and self.embedded_speaker_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(self.embedded_speaker_dim, config.model_args.hidden_channels, 1) @staticmethod def compute_log_probs(mu, log_sigma, y): @@ -129,15 +177,15 @@ class AlignTTS(nn.Module): """Generate attention alignment map from durations and expand encoder outputs - Example: - encoder output: [a,b,c,d] - durations: [1, 3, 2, 1] + Examples:: + - encoder output: [a,b,c,d] + - durations: [1, 3, 2, 1] - expanded: [a, b, b, b, c, c, d] - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] + - expanded: [a, b, b, b, c, c, d] + - attention map: [[0, 0, 0, 0, 0, 
0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] """ attn = self.convert_dr_to_align(dr, x_mask, y_mask) o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) @@ -159,11 +207,12 @@ class AlignTTS(nn.Module): # project g to decoder dim. if hasattr(self, "proj_g"): g = self.proj_g(g) + return x + g def _forward_encoder(self, x, x_lengths, g=None): if hasattr(self, "emb_g"): - g = nn.functional.normalize(self.emb_g(g)) # [B, C, 1] + g = nn.functional.normalize(self.speaker_embedding(g)) # [B, C, 1] if g is not None: g = g.unsqueeze(-1) @@ -207,15 +256,19 @@ class AlignTTS(nn.Module): dr_mas, logp = self.compute_align_path(mu, log_sigma, y, x_mask, y_mask) return dr_mas, mu, log_sigma, logp - def forward(self, x, x_lengths, y, y_lengths, phase=None, g=None): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y, y_lengths, aux_input={"d_vectors": None}, phase=None + ): # pylint: disable=unused-argument """ Shapes: - x: [B, T_max] - x_lengths: [B] - y_lengths: [B] - dr: [B, T_max] - g: [B, C] + - x: :math:`[B, T_max]` + - x_lengths: :math:`[B]` + - y_lengths: :math:`[B]` + - dr: :math:`[B, T_max]` + - g: :math:`[B, C]` """ + y = y.transpose(1, 2) + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -247,16 +300,27 @@ class AlignTTS(nn.Module): o_de, attn = self._forward_decoder(o_en, o_en_dp, dr_mas, x_mask, y_lengths, g=g) o_dr_log = o_dr_log.squeeze(1) dr_mas_log = torch.log(dr_mas + 1).squeeze(1) - return o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp + outputs = { + "model_outputs": o_de.transpose(1, 2), + "alignments": attn, + "durations_log": o_dr_log, + "durations_mas_log": dr_mas_log, + "mu": mu, + "log_sigma": log_sigma, + "logp": logp, + } + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, x, aux_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: - x: [B, T_max] - x_lengths: [B] - g: [B, C] + - x: :math:`[B, T_max]` + - x_lengths: :math:`[B]` + - g: :math:`[B, C]` """ + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) @@ -266,7 +330,61 @@ class AlignTTS(nn.Module): o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn} + return outputs + + def train_step(self, batch: dict, criterion: nn.Module): + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + d_vectors = batch["d_vectors"] + speaker_ids = batch["speaker_ids"] + + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input, self.phase) + loss_dict = criterion( + outputs["logp"], + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + outputs["durations_mas_log"], + text_lengths, + phase=self.phase, + ) + + # compute alignment 
error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log( + self, ap: AudioProcessor, batch: dict, outputs: dict + ) -> Tuple[Dict, Dict]: # pylint: disable=no-self-use + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, {"audio": train_audio} + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -276,3 +394,29 @@ class AlignTTS(nn.Module): if eval: self.eval() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import AlignTTSLoss # pylint: disable=import-outside-toplevel + + return AlignTTSLoss(self.config) + + @staticmethod + def _set_phase(config, global_step): + """Decide AlignTTS training phase""" + if isinstance(config.phase_start_steps, list): + vals = [i < global_step for i in config.phase_start_steps] + if not True in vals: + phase = 0 + else: + phase = ( + len(config.phase_start_steps) + - [i < global_step for i in config.phase_start_steps][::-1].index(True) + - 1 + ) + else: + phase = None + return phase + + def on_epoch_start(self, trainer): + """Set AlignTTS training phase on epoch start.""" + self.phase = self._set_phase(trainer.config, trainer.total_steps_done) diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py new file mode 100644 index 00000000..b7056e06 --- /dev/null +++ b/TTS/tts/models/base_tacotron.py @@ -0,0 +1,284 @@ +import copy +from abc import abstractmethod +from dataclasses import dataclass +from typing import Dict, List + +import torch +from coqpit import MISSING, Coqpit +from torch import nn + +from TTS.tts.layers.losses import TacotronLoss +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.text import make_symbols +from TTS.utils.generic_utils import format_aux_input +from TTS.utils.training import gradual_training_scheduler + + +@dataclass +class BaseTacotronArgs(Coqpit): + """TODO: update Tacotron configs using it""" + + num_chars: int = MISSING + num_speakers: int = MISSING + r: int = MISSING + out_channels: int = 80 + decoder_output_dim: int = 80 + attn_type: str = "original" + attn_win: bool = False + attn_norm: str = "softmax" + prenet_type: str = "original" + prenet_dropout: bool = True + prenet_dropout_at_inference: bool = False + forward_attn: bool = False + trans_agent: bool = False + forward_attn_mask: bool = False + location_attn: bool = True + attn_K: int = 5 + separate_stopnet: bool = True + bidirectional_decoder: bool = False + double_decoder_consistency: bool = False + ddc_r: int = None + encoder_in_features: int = 512 + decoder_in_features: int = 512 + d_vector_dim: int = None + use_gst: bool 
= False + gst: bool = None + gradual_training: bool = None + + +class BaseTacotron(BaseTTS): + def __init__(self, config: Coqpit): + """Abstract Tacotron class""" + super().__init__() + + for key in config: + setattr(self, key, config[key]) + + # layers + self.embedding = None + self.encoder = None + self.decoder = None + self.postnet = None + + # init tensors + self.embedded_speakers = None + self.embedded_speakers_projected = None + + # global style token + if self.gst and self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim + self.gst_layer = None + + # additional layers + self.decoder_backward = None + self.coarse_decoder = None + + # init multi-speaker layers + self.init_multispeaker(config) + + @staticmethod + def _format_aux_input(aux_input: Dict) -> Dict: + return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input) + + ############################# + # INIT FUNCTIONS + ############################# + + def _init_states(self): + self.embedded_speakers = None + self.embedded_speakers_projected = None + + def _init_backward_decoder(self): + self.decoder_backward = copy.deepcopy(self.decoder) + + def _init_coarse_decoder(self): + self.coarse_decoder = copy.deepcopy(self.decoder) + self.coarse_decoder.r_init = self.ddc_r + self.coarse_decoder.set_r(self.ddc_r) + + ############################# + # CORE FUNCTIONS + ############################# + + @abstractmethod + def forward(self): + pass + + @abstractmethod + def inference(self): + pass + + def load_checkpoint( + self, config, checkpoint_path, eval=False + ): # pylint: disable=unused-argument, redefined-builtin + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if "r" in state: + self.decoder.set_r(state["r"]) + else: + self.decoder.set_r(state["config"]["r"]) + if eval: + self.eval() + assert not self.training + + def get_criterion(self) -> nn.Module: + return TacotronLoss(self.config) + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel + parse_symbols, + phonemes, + symbols, + ) + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + return model_characters, config + + @staticmethod + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def get_aux_input(self, **kwargs) -> Dict: + """Compute Tacotron's auxiliary inputs based on model config. + - speaker d_vector + - style wav for GST + - speaker ID for speaker embedding + """ + # setup speaker_id + if self.config.use_speaker_embedding: + speaker_id = kwargs.get("speaker_id", 0) + else: + speaker_id = None + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_names[0]) + if self.config.use_d_vector_file and self.config.use_speaker_embedding + else None + ) + # setup style_mel + if "style_wav" in kwargs: + style_wav = kwargs["style_wav"] + elif self.config.has("gst_style_input"): + style_wav = self.config.gst_style_input + else: + style_wav = None + if style_wav is None and "use_gst" in self.config and self.config.use_gst: + # inicialize GST with zero dict. 
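+            # (editor's illustrative note, not part of the upstream patch) the zero-style dict maps
+            # every GST token index to a 0.0 amplifier, e.g. {"0": 0, "1": 0, ...}, so the attended
+            # style contribution sums to zero and no style is imposed on the synthesized speech.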
+            style_wav = {}
+            print("WARNING: You didn't provide a GST style wav; using a zero style vector instead!")
+            for i in range(self.config.gst["gst_num_style_tokens"]):
+                style_wav[str(i)] = 0
+        aux_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
+        return aux_inputs
+
+    #############################
+    # COMMON COMPUTE FUNCTIONS
+    #############################
+
+    def compute_masks(self, text_lengths, mel_lengths):
+        """Compute masks against sequence paddings."""
+        # B x T_in_max (boolean)
+        input_mask = sequence_mask(text_lengths)
+        output_mask = None
+        if mel_lengths is not None:
+            max_len = mel_lengths.max()
+            r = self.decoder.r
+            max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len
+            output_mask = sequence_mask(mel_lengths, max_len=max_len)
+        return input_mask, output_mask
+
+    def _backward_pass(self, mel_specs, encoder_outputs, mask):
+        """Run backwards decoder"""
+        decoder_outputs_b, alignments_b, _ = self.decoder_backward(
+            encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask
+        )
+        decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous()
+        return decoder_outputs_b, alignments_b
+
+    def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask):
+        """Double Decoder Consistency"""
+        T = mel_specs.shape[1]
+        if T % self.coarse_decoder.r > 0:
+            padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r)
+            mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0))
+        decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder(
+            encoder_outputs.detach(), mel_specs, input_mask
+        )
+        # scale_factor = self.decoder.r_init / self.decoder.r
+        alignments_backward = torch.nn.functional.interpolate(
+            alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest"
+        ).transpose(1, 2)
+        decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
+        decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
+        return decoder_outputs_backward, alignments_backward
+
+    #############################
+    # EMBEDDING FUNCTIONS
+    #############################
+
+    def compute_speaker_embedding(self, speaker_ids):
+        """Compute speaker embedding vectors"""
+        if hasattr(self, "speaker_embedding") and speaker_ids is None:
+            raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") + if hasattr(self, "speaker_embedding") and speaker_ids is not None: + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) + if hasattr(self, "speaker_project_mel") and speaker_ids is not None: + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) + + def compute_gst(self, inputs, style_input, speaker_embedding=None): + """Compute global style token""" + if isinstance(style_input, dict): + query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs) + if speaker_embedding is not None: + query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) + + _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + for k_token, v_amplifier in style_input.items(): + key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) + gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) + gst_outputs = gst_outputs + gst_outputs_att * v_amplifier + elif style_input is None: + gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs) + else: + gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable + inputs = self._concat_speaker_embedding(inputs, gst_outputs) + return inputs + + @staticmethod + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ + return outputs + + @staticmethod + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) + return outputs + + ############################# + # CALLBACKS + ############################# + + def on_epoch_start(self, trainer): + """Callback for setting values wrt gradual training schedule. + + Args: + trainer (TrainerTTS): TTS trainer object that is used to train this model. + """ + if self.gradual_training: + r, trainer.config.batch_size = gradual_training_scheduler(trainer.total_steps_done, trainer.config) + trainer.config.r = r + self.decoder.set_r(r) + if trainer.config.bidirectional_decoder: + trainer.model.decoder_backward.set_r(r) + print(f"\n > Number of output frames: {self.decoder.r}") diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py new file mode 100644 index 00000000..2ec268d6 --- /dev/null +++ b/TTS/tts/models/base_tts.py @@ -0,0 +1,234 @@ +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.model import BaseModel +from TTS.tts.datasets import TTSDataset +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager +from TTS.tts.utils.synthesis import synthesis +from TTS.tts.utils.text import make_symbols +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor + +# pylint: skip-file + + +class BaseTTS(BaseModel): + """Abstract `tts` class. Every new `tts` model must inherit this. + + It defines `tts` specific functions on top of `Model`. 
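+
+    Example (illustrative sketch only; the exact abstract surface is defined by `BaseModel`,
+    and the method names below mirror the subclasses introduced in this patch):
+        >>> class MyTTS(BaseTTS):
+        ...     def forward(self, x, x_lengths, y, y_lengths, aux_input=None): ...
+        ...     def inference(self, x, aux_input=None): ...
+        ...     def train_step(self, batch, criterion): ...
+        ...     def train_log(self, ap, batch, outputs): ...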
+ + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + @staticmethod + def get_characters(config: Coqpit) -> str: + # TODO: implement CharacterProcessor + if config.characters is not None: + symbols, phonemes = make_symbols(**config.characters) + else: + from TTS.tts.utils.text.symbols import parse_symbols, phonemes, symbols + + config.characters = parse_symbols() + model_characters = phonemes if config.use_phonemes else symbols + num_chars = len(model_characters) + getattr(config, "add_blank", False) + return model_characters, config, num_chars + + def get_speaker_manager(config: Coqpit, restore_path: str, data: List, out_path: str = None) -> SpeakerManager: + return get_speaker_manager(config, restore_path, data, out_path) + + def init_multispeaker(self, config: Coqpit, data: List = None): + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + If you need a different behaviour, override this function for your model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. + """ + # init speaker manager + self.speaker_manager = get_speaker_manager(config, data=data) + self.num_speakers = self.speaker_manager.num_speakers + # init speaker embedding layer + if config.use_speaker_embedding and not config.use_d_vector_file: + self.embedded_speaker_dim = ( + config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512 + ) + self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) + self.speaker_embedding.weight.data.normal_(0, 0.3) + + def get_aux_input(self, **kwargs) -> Dict: + """Prepare and return `aux_input` used by `forward()`""" + pass + + def format_batch(self, batch: Dict) -> Dict: + """Generic batch formatting for `TTSDataset`. + + You must override this if you use a custom dataset. + + Args: + batch (Dict): [description] + + Returns: + Dict: [description] + """ + # setup input batch + text_input = batch[0] + text_lengths = batch[1] + speaker_names = batch[2] + linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None + mel_input = batch[4] + mel_lengths = batch[5] + stop_targets = batch[6] + item_idx = batch[7] + d_vectors = batch[8] + speaker_ids = batch[9] + attn_mask = batch[10] + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) + + # compute durations from attention masks + durations = None + if attn_mask is not None: + durations = torch.zeros(attn_mask.shape[0], attn_mask.shape[2]) + for idx, am in enumerate(attn_mask): + # compute raw durations + c_idxs = am[:, : text_lengths[idx], : mel_lengths[idx]].max(1)[1] + # c_idxs, counts = torch.unique_consecutive(c_idxs, return_counts=True) + c_idxs, counts = torch.unique(c_idxs, return_counts=True) + dur = torch.ones([text_lengths[idx]]).to(counts.dtype) + dur[c_idxs] = counts + # smooth the durations and set any 0 duration to 1 + # by cutting off from the largest duration indeces. + extra_frames = dur.sum() - mel_lengths[idx] + largest_idxs = torch.argsort(-dur)[:extra_frames] + dur[largest_idxs] -= 1 + assert ( + dur.sum() == mel_lengths[idx] + ), f" [!] 
total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}" + durations[idx, : text_lengths[idx]] = dur + + # set stop targets view, we predict a single stop token per iteration. + stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) + + return { + "text_input": text_input, + "text_lengths": text_lengths, + "speaker_names": speaker_names, + "mel_input": mel_input, + "mel_lengths": mel_lengths, + "linear_input": linear_input, + "stop_targets": stop_targets, + "attn_mask": attn_mask, + "durations": durations, + "speaker_ids": speaker_ids, + "d_vectors": d_vectors, + "max_text_length": float(max_text_length), + "max_spec_length": float(max_spec_length), + "item_idx": item_idx, + } + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool, num_gpus: int + ) -> "DataLoader": + if is_eval and not config.run_eval: + loader = None + else: + # setup multi-speaker attributes + if hasattr(self, "speaker_manager"): + speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None + d_vector_mapping = ( + self.speaker_manager.d_vectors + if config.use_speaker_embedding and config.use_d_vector_file + else None + ) + else: + speaker_id_mapping = None + d_vector_mapping = None + + # init dataloader + dataset = TTSDataset( + outputs_per_step=config.r if "r" in config else 1, + text_cleaner=config.text_cleaner, + compute_linear_spec=config.model.lower() == "tacotron", + meta_data=data_items, + ap=ap, + characters=config.characters, + add_blank=config["add_blank"], + batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, + min_seq_len=config.min_seq_len, + max_seq_len=config.max_seq_len, + phoneme_cache_path=config.phoneme_cache_path, + use_phonemes=config.use_phonemes, + phoneme_language=config.phoneme_language, + enable_eos_bos=config.enable_eos_bos_chars, + use_noise_augment=not is_eval, + verbose=verbose, + speaker_id_mapping=speaker_id_mapping, + d_vector_mapping=d_vector_mapping + if config.use_speaker_embedding and config.use_d_vector_file + else None, + ) + + if config.use_phonemes and config.compute_input_seq_cache: + # precompute phonemes to have a better estimate of sequence lengths. + dataset.compute_input_seq(config.num_loader_workers) + dataset.sort_items() + + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=config.eval_batch_size if is_eval else config.batch_size, + shuffle=False, + collate_fn=dataset.collate_fn, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def test_run(self) -> Tuple[Dict, Dict]: + """Generic test run for `tts` models used by `Trainer`. + + You can override this for a different behaviour. + + Returns: + Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard. 
+ """ + print(" | > Synthesizing test sentences.") + test_audios = {} + test_figures = {} + test_sentences = self.config.test_sentences + aux_inputs = self._get_aux_inputs() + for idx, sen in enumerate(test_sentences): + wav, alignment, model_outputs, _ = synthesis( + self.model, + sen, + self.config, + self.use_cuda, + self.ap, + speaker_id=aux_inputs["speaker_id"], + d_vector=aux_inputs["d_vector"], + style_wav=aux_inputs["style_wav"], + enable_eos_bos_chars=self.config.enable_eos_bos_chars, + use_griffin_lim=True, + do_trim_silence=False, + ).values() + + test_audios["{}-audio".format(idx)] = wav + test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False) + test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False) + return test_figures, test_audios diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 19eb594a..9f235fad 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -4,132 +4,116 @@ import torch from torch import nn from torch.nn import functional as F +from TTS.tts.configs import GlowTTSConfig from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.speakers import get_speaker_manager +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class GlowTTS(nn.Module): +class GlowTTS(BaseTTS): """Glow TTS models from https://arxiv.org/abs/2005.11129 - Args: - num_chars (int): number of embedding characters. - hidden_channels_enc (int): number of embedding and encoder channels. - hidden_channels_dec (int): number of decoder channels. - use_encoder_prenet (bool): enable/disable prenet for encoder. Prenet modules are hard-coded for each alternative encoder. - hidden_channels_dp (int): number of duration predictor channels. - out_channels (int): number of output channels. It should be equal to the number of spectrogram filter. - num_flow_blocks_dec (int): number of decoder blocks. - kernel_size_dec (int): decoder kernel size. - dilation_rate (int): rate to increase dilation by each layer in a decoder block. - num_block_layers (int): number of decoder layers in each decoder block. - dropout_p_dec (float): dropout rate for decoder. - num_speaker (int): number of speaker to define the size of speaker embedding layer. - c_in_channels (int): number of speaker embedding channels. It is set to 512 if embeddings are learned. - num_splits (int): number of split levels in inversible conv1x1 operation. - num_squeeze (int): number of squeeze levels. When squeezing channels increases and time steps reduces by the factor 'num_squeeze'. - sigmoid_scale (bool): enable/disable sigmoid scaling in decoder. - mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. - encoder_type (str): encoder module type. - encoder_params (dict): encoder module parameters. - speaker_embedding_dim (int): channels of external speaker embedding vectors. + Paper abstract: + Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate + mel-spectrograms from text in parallel. 
Despite the advantage, the parallel TTS models cannot be trained + without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS, + a flow-based generative model for parallel TTS that does not require any external aligner. By combining the + properties of flows and dynamic programming, the proposed model searches for the most probable monotonic + alignment between text and the latent representation of speech on its own. We demonstrate that enforcing hard + monotonic alignments enables robust TTS, which generalizes to long utterances, and employing generative flows + enables fast, diverse, and controllable speech synthesis. Glow-TTS obtains an order-of-magnitude speed-up over + the autoregressive model, Tacotron 2, at synthesis with comparable speech quality. We further show that our + model can be easily extended to a multi-speaker setting. + + Check :class:`TTS.tts.configs.glow_tts_config.GlowTTSConfig` for class arguments. + + Examples: + >>> from TTS.tts.configs import GlowTTSConfig + >>> from TTS.tts.models.glow_tts import GlowTTS + >>> config = GlowTTSConfig() + >>> model = GlowTTS(config) + """ - def __init__( - self, - num_chars, - hidden_channels_enc, - hidden_channels_dec, - use_encoder_prenet, - hidden_channels_dp, - out_channels, - num_flow_blocks_dec=12, - inference_noise_scale=0.33, - kernel_size_dec=5, - dilation_rate=5, - num_block_layers=4, - dropout_p_dp=0.1, - dropout_p_dec=0.05, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - encoder_type="transformer", - encoder_params=None, - speaker_embedding_dim=None, - ): + def __init__(self, config: GlowTTSConfig): super().__init__() - self.num_chars = num_chars - self.hidden_channels_dp = hidden_channels_dp - self.hidden_channels_enc = hidden_channels_enc - self.hidden_channels_dec = hidden_channels_dec - self.out_channels = out_channels - self.num_flow_blocks_dec = num_flow_blocks_dec - self.kernel_size_dec = kernel_size_dec - self.dilation_rate = dilation_rate - self.num_block_layers = num_block_layers - self.dropout_p_dec = dropout_p_dec - self.num_speakers = num_speakers - self.c_in_channels = c_in_channels - self.num_splits = num_splits - self.num_squeeze = num_squeeze - self.sigmoid_scale = sigmoid_scale - self.mean_only = mean_only - self.use_encoder_prenet = use_encoder_prenet - self.inference_noise_scale = inference_noise_scale - # model constants. - self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. - self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. 
- self.speaker_embedding_dim = speaker_embedding_dim + # pass all config fields to `self` + # for fewer code change + self.config = config + for key in config: + setattr(self, key, config[key]) + + _, self.config, self.num_chars = self.get_characters(config) + self.decoder_output_dim = config.out_channels + + self.init_multispeaker(config) # if is a multispeaker and c_in_channels is 0, set to 256 - if num_speakers > 1: - if self.c_in_channels == 0 and not self.speaker_embedding_dim: + self.c_in_channels = 0 + if self.num_speakers > 1: + if self.d_vector_dim: + self.c_in_channels = self.d_vector_dim + elif self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.speaker_embedding_dim: - self.c_in_channels = self.speaker_embedding_dim self.encoder = Encoder( - num_chars, - out_channels=out_channels, - hidden_channels=hidden_channels_enc, - hidden_channels_dp=hidden_channels_dp, - encoder_type=encoder_type, - encoder_params=encoder_params, - mean_only=mean_only, - use_prenet=use_encoder_prenet, - dropout_p_dp=dropout_p_dp, + self.num_chars, + out_channels=self.out_channels, + hidden_channels=self.hidden_channels_enc, + hidden_channels_dp=self.hidden_channels_dp, + encoder_type=self.encoder_type, + encoder_params=self.encoder_params, + mean_only=self.mean_only, + use_prenet=self.use_encoder_prenet, + dropout_p_dp=self.dropout_p_dp, c_in_channels=self.c_in_channels, ) self.decoder = Decoder( - out_channels, - hidden_channels_dec, - kernel_size_dec, - dilation_rate, - num_flow_blocks_dec, - num_block_layers, - dropout_p=dropout_p_dec, - num_splits=num_splits, - num_squeeze=num_squeeze, - sigmoid_scale=sigmoid_scale, + self.out_channels, + self.hidden_channels_dec, + self.kernel_size_dec, + self.dilation_rate, + self.num_flow_blocks_dec, + self.num_block_layers, + dropout_p=self.dropout_p_dec, + num_splits=self.num_splits, + num_squeeze=self.num_squeeze, + sigmoid_scale=self.sigmoid_scale, c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not speaker_embedding_dim: - # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) + def init_multispeaker(self, config: "Coqpit", data: list = None) -> None: + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + If you need a different behaviour, override this function for your model. + + Args: + config (Coqpit): Model configuration. + data (List, optional): Dataset items to infer number of speakers. Defaults to None. 
+        """
+        # init speaker manager
+        self.speaker_manager = get_speaker_manager(config, data=data)
+        self.num_speakers = self.speaker_manager.num_speakers
+        # init speaker embedding layer
+        if config.use_speaker_embedding and not config.use_d_vector_file:
+            self.embedded_speaker_dim = self.c_in_channels
+            self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
             nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
 
     @staticmethod
     def compute_outputs(attn, o_mean, o_log_scale, x_mask):
-        # compute final values with the computed alignment
+        """Compute and format the model outputs with the given alignment map"""
         y_mean = torch.matmul(attn.squeeze(1).transpose(1, 2), o_mean.transpose(1, 2)).transpose(
             1, 2
         )  # [b, t', t], [b, t, d] -> [b, d, t']
@@ -140,19 +124,23 @@ class GlowTTS(nn.Module):
         o_attn_dur = torch.log(1 + torch.sum(attn, -1)) * x_mask
         return y_mean, y_log_scale, o_attn_dur
 
-    def forward(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None):
+    def forward(
+        self, x, x_lengths, y, y_lengths=None, aux_input={"d_vectors": None}
+    ):  # pylint: disable=dangerous-default-value
         """
         Shapes:
-            x: [B, T]
-            x_lenghts: B
-            y: [B, C, T]
-            y_lengths: B
-            g: [B, C] or B
+            - x: :math:`[B, T]`
+            - x_lengths: :math:`B`
+            - y: :math:`[B, T, C]`
+            - y_lengths: :math:`B`
+            - g: :math:`[B, C] or B`
         """
+        y = y.transpose(1, 2)
         y_max_length = y.size(2)
         # norm speaker embeddings
+        g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None
         if g is not None:
-            if self.speaker_embedding_dim:
+            if self.d_vector_dim:
                 g = F.normalize(g).unsqueeze(-1)
             else:
                 g = F.normalize(self.emb_g(g)).unsqueeze(-1)  # [b, h, 1]
@@ -177,24 +165,38 @@ class GlowTTS(nn.Module):
         attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
         y_mean, y_log_scale, o_attn_dur = self.compute_outputs(attn, o_mean, o_log_scale, x_mask)
         attn = attn.squeeze(1).permute(0, 2, 1)
-        return z, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur
+        outputs = {
+            "model_outputs": z.transpose(1, 2),
+            "logdet": logdet,
+            "y_mean": y_mean.transpose(1, 2),
+            "y_log_scale": y_log_scale.transpose(1, 2),
+            "alignments": attn,
+            "durations_log": o_dur_log.transpose(1, 2),
+            "total_durations_log": o_attn_dur.transpose(1, 2),
+        }
+        return outputs
 
     @torch.no_grad()
-    def inference_with_MAS(self, x, x_lengths, y=None, y_lengths=None, attn=None, g=None):
+    def inference_with_MAS(
+        self, x, x_lengths, y=None, y_lengths=None, aux_input={"d_vectors": None}
+    ):  # pylint: disable=dangerous-default-value
         """
         It's similar to the teacher forcing in Tacotron. 
It was proposed in: https://arxiv.org/abs/2104.05557 + Shapes: - x: [B, T] - x_lenghts: B - y: [B, C, T] - y_lengths: B - g: [B, C] or B + - x: :math:`[B, T]` + - x_lenghts: :math:`B` + - y: :math:`[B, T, C]` + - y_lengths: :math:`B` + - g: :math:`[B, C] or B` """ + y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -225,21 +227,33 @@ class GlowTTS(nn.Module): # reverse the decoder and predict using the aligned distribution y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + "model_outputs": z.transpose(1, 2), + "logdet": logdet, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), + "alignments": attn, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), + } + return outputs @torch.no_grad() - def decoder_inference(self, y, y_lengths=None, g=None): + def decoder_inference( + self, y, y_lengths=None, aux_input={"d_vectors": None} + ): # pylint: disable=dangerous-default-value """ Shapes: - y: [B, C, T] - y_lengths: B - g: [B, C] or B + - y: :math:`[B, T, C]` + - y_lengths: :math:`B` + - g: :math:`[B, C] or B` """ + y = y.transpose(1, 2) y_max_length = y.size(2) + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None # norm speaker embeddings if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -252,12 +266,18 @@ class GlowTTS(nn.Module): # reverse decoder and predict y, logdet = self.decoder(z, y_mask, g=g, reverse=True) - return y, logdet + outputs = {} + outputs["model_outputs"] = y.transpose(1, 2) + outputs["logdet"] = logdet + return outputs @torch.no_grad() - def inference(self, x, x_lengths, g=None): + def inference(self, x, aux_input={"x_lengths": None, "d_vectors": None}): # pylint: disable=dangerous-default-value + x_lengths = aux_input["x_lengths"] + g = aux_input["d_vectors"] if aux_input is not None and "d_vectors" in aux_input else None + if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] @@ -280,7 +300,72 @@ class GlowTTS(nn.Module): # decoder pass y, logdet = self.decoder(z, y_mask, g=g, reverse=True) attn = attn.squeeze(1).permute(0, 2, 1) - return y, logdet, y_mean, y_log_scale, attn, o_dur_log, o_attn_dur + outputs = { + "model_outputs": y.transpose(1, 2), + "logdet": logdet, + "y_mean": y_mean.transpose(1, 2), + "y_log_scale": y_log_scale.transpose(1, 2), + "alignments": attn, + "durations_log": o_dur_log.transpose(1, 2), + "total_durations_log": o_attn_dur.transpose(1, 2), + } + return outputs + + def train_step(self, batch: dict, criterion: nn.Module): + """Perform a single training step by fetching the right set if samples from the batch. 
+ + Args: + batch (dict): [description] + criterion (nn.Module): [description] + """ + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + d_vectors = batch["d_vectors"] + + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input={"d_vectors": d_vectors}) + + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, {"audio": train_audio} + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) def preprocess(self, y, y_lengths, y_max_length, attn=None): if y_max_length is not None: @@ -303,3 +388,8 @@ class GlowTTS(nn.Module): self.eval() self.store_inverse() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import GlowTTSLoss # pylint: disable=import-outside-toplevel + + return GlowTTSLoss() diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 9880b82b..8f14d610 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -1,4 +1,7 @@ +from dataclasses import dataclass, field + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.feed_forward.decoder import Decoder @@ -6,21 +9,16 @@ from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.monotonic_align import generate_path -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.data import sequence_mask +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class SpeedySpeech(nn.Module): - """Speedy Speech model - https://arxiv.org/abs/2008.03802 - - Encoder -> DurationPredictor -> Decoder - - This model is able to achieve a reasonable performance with only - ~3M model parameters and convolutional layers. - - This model requires precomputed phoneme durations to train a duration predictor. At inference - it only uses the duration predictor to compute durations and expand encoder outputs respectively. 
- +@dataclass +class SpeedySpeechArgs(Coqpit): + """ Args: num_chars (int): number of unique input to characters out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size. @@ -32,49 +30,106 @@ class SpeedySpeech(nn.Module): decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'. decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }. num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0. - external_c (bool, optional): enable external speaker embeddings. Defaults to False. - c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0. + use_d_vector (bool, optional): enable external speaker embeddings. Defaults to False. + d_vector_dim (int, optional): number of channels in speaker embedding vectors. Defaults to 0. """ - # pylint: disable=dangerous-default-value - - def __init__( - self, - num_chars, - out_channels, - hidden_channels, - positional_encoding=True, - length_scale=1, - encoder_type="residual_conv_bn", - encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13}, - decoder_type="residual_conv_bn", - decoder_params={ + num_chars: int = None + out_channels: int = 80 + hidden_channels: int = 128 + num_speakers: int = 0 + positional_encoding: bool = True + length_scale: int = 1 + encoder_type: str = "residual_conv_bn" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13, + } + ) + decoder_type: str = "residual_conv_bn" + decoder_params: dict = field( + default_factory=lambda: { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17, - }, - num_speakers=0, - external_c=False, - c_in_channels=0, - ): + } + ) + use_d_vector: bool = False + d_vector_dim: int = 0 + +class SpeedySpeech(BaseTTS): + """Speedy Speech model + https://arxiv.org/abs/2008.03802 + + Encoder -> DurationPredictor -> Decoder + + Paper abstract: + While recent neural sequence-to-sequence models have greatly improved the quality of speech + synthesis, there has not been a system capable of fast training, fast inference and high-quality audio synthesis + at the same time. We propose a student-teacher network capable of high-quality faster-than-real-time spectrogram + synthesis, with low requirements on computational resources and fast training time. We show that self-attention + layers are not necessary for generation of high quality audio. We utilize simple convolutional blocks with + residual connections in both student and teacher networks and use only a single attention layer in the teacher + model. Coupled with a MelGAN vocoder, our model's voice quality was rated significantly higher than Tacotron 2. + Our model can be efficiently trained on a single GPU and can run in real time even on a CPU. We provide both + our source code and audio samples in our GitHub repository. + + Notes: + The vanilla model is able to achieve a reasonable performance with only + ~3M model parameters and convolutional layers. + + This model requires precomputed phoneme durations to train a duration predictor. At inference + it only uses the duration predictor to compute durations and expand encoder outputs respectively. 
+ + You can also mix and match different encoder and decoder networks beyond the paper. + + Check `SpeedySpeechArgs` for arguments. + """ + + # pylint: disable=dangerous-default-value + + def __init__(self, config: Coqpit): super().__init__() - self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale - self.emb = nn.Embedding(num_chars, hidden_channels) - self.encoder = Encoder(hidden_channels, hidden_channels, encoder_type, encoder_params, c_in_channels) - if positional_encoding: - self.pos_encoder = PositionalEncoding(hidden_channels) - self.decoder = Decoder(out_channels, hidden_channels, decoder_type, decoder_params) - self.duration_predictor = DurationPredictor(hidden_channels + c_in_channels) + self.config = config - if num_speakers > 1 and not external_c: + if "characters" in config: + _, self.config, self.num_chars = self.get_characters(config) + + self.length_scale = ( + float(config.model_args.length_scale) + if isinstance(config.model_args.length_scale, int) + else config.model_args.length_scale + ) + self.emb = nn.Embedding(config.model_args.num_chars, config.model_args.hidden_channels) + self.encoder = Encoder( + config.model_args.hidden_channels, + config.model_args.hidden_channels, + config.model_args.encoder_type, + config.model_args.encoder_params, + config.model_args.d_vector_dim, + ) + if config.model_args.positional_encoding: + self.pos_encoder = PositionalEncoding(config.model_args.hidden_channels) + self.decoder = Decoder( + config.model_args.out_channels, + config.model_args.hidden_channels, + config.model_args.decoder_type, + config.model_args.decoder_params, + ) + self.duration_predictor = DurationPredictor(config.model_args.hidden_channels + config.model_args.d_vector_dim) + + if config.model_args.num_speakers > 1 and not config.model_args.use_d_vector: # speaker embedding layer - self.emb_g = nn.Embedding(num_speakers, c_in_channels) + self.emb_g = nn.Embedding(config.model_args.num_speakers, config.model_args.d_vector_dim) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - if c_in_channels > 0 and c_in_channels != hidden_channels: - self.proj_g = nn.Conv1d(c_in_channels, hidden_channels, 1) + if config.model_args.d_vector_dim > 0 and config.model_args.d_vector_dim != config.model_args.hidden_channels: + self.proj_g = nn.Conv1d(config.model_args.d_vector_dim, config.model_args.hidden_channels, 1) @staticmethod def expand_encoder_outputs(en, dr, x_mask, y_mask): @@ -153,8 +208,11 @@ class SpeedySpeech(nn.Module): o_de = self.decoder(o_en_ex, y_mask, g=g) return o_de, attn.transpose(1, 2) - def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument + def forward( + self, x, x_lengths, y_lengths, dr, aux_input={"d_vectors": None, "speaker_ids": None} + ): # pylint: disable=unused-argument """ + TODO: speaker embedding for speaker_ids Shapes: x: [B, T_max] x_lengths: [B] @@ -162,18 +220,22 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) - return o_de, o_dr_log.squeeze(1), attn + outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} + return outputs - def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + def inference(self, x, 
aux_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ + g = aux_input["d_vectors"] if "d_vectors" in aux_input else None + x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 if x.shape[1] < 13: @@ -186,7 +248,60 @@ class SpeedySpeech(nn.Module): o_dr = self.format_durations(o_dr_log, x_mask).squeeze(1) y_lengths = o_dr.sum(1) o_de, attn = self._forward_decoder(o_en, o_en_dp, o_dr, x_mask, y_lengths, g=g) - return o_de, attn + outputs = {"model_outputs": o_de.transpose(1, 2), "alignments": attn, "durations_log": None} + return outputs + + def train_step(self, batch: dict, criterion: nn.Module): + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + d_vectors = batch["d_vectors"] + speaker_ids = batch["speaker_ids"] + durations = batch["durations"] + + aux_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} + outputs = self.forward(text_input, text_lengths, mel_lengths, durations, aux_input) + + # compute loss + loss_dict = criterion( + outputs["model_outputs"], + mel_input, + mel_lengths, + outputs["durations_log"], + torch.log(1 + durations), + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"], binary=True) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict): # pylint: disable=no-self-use + model_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + mel_input = batch["mel_input"] + + pred_spec = model_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, {"audio": train_audio} + + def eval_step(self, batch: dict, criterion: nn.Module): + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict): + return self.train_log(ap, batch, outputs) def load_checkpoint( self, config, checkpoint_path, eval=False @@ -196,3 +311,8 @@ class SpeedySpeech(nn.Module): if eval: self.eval() assert not self.training + + def get_criterion(self): + from TTS.tts.layers.losses import SpeedySpeechLoss # pylint: disable=import-outside-toplevel + + return SpeedySpeechLoss(self.config) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 89d98e9f..95b4a358 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -1,157 +1,86 @@ # coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -class Tacotron(TacotronAbstract): +class 
Tacotron(BaseTacotron): """Tacotron as in https://arxiv.org/abs/1703.10135 - It's an autoregressive encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. - memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` - output frames to the prenet. + Check `TacotronConfig` for the arguments. 
""" - def __init__( - self, - num_chars, - num_speakers, - r=5, - postnet_output_dim=1025, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="sigmoid", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=256, - decoder_in_features=256, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - memory_size=5, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - speaker_embedding_dim, - use_gst, - gst, - ) + def __init__(self, config: Coqpit): + super().__init__(config) - # speaker embedding layers + self.num_chars, self.config = self.get_characters(config) + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) + + # speaker embedding layer if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding.weight.data.normal_(0, 0.3) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, + self.decoder_output_dim, + self.r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = PostCBHG(decoder_output_dim) - self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, postnet_output_dim) + self.postnet = PostCBHG(self.decoder_output_dim) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + 
num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder if self.bidirectional_decoder: @@ -160,35 +89,35 @@ class Tacotron(TacotronAbstract): if self.double_decoder_consistency: self.coarse_decoder = Decoder( self.decoder_in_features, - decoder_output_dim, - ddc_r, - memory_size, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, + self.decoder_output_dim, + self.ddc_r, + self.memory_size, + self.attention_type, + self.windowing, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - def forward(self, characters, text_lengths, mel_specs, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: - characters: [B, T_in] + text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} + inputs = self.embedding(text) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) - # B x T_in x embed_dim - inputs = self.embedding(characters) # B x T_in x encoder_in_features encoder_outputs = self.encoder(inputs) # sequence masking @@ -196,16 +125,18 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) # speaker embedding if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -224,45 +155,139 @@ class Tacotron(TacotronAbstract): decoder_outputs = decoder_outputs.transpose(1, 2).contiguous() if self.bidirectional_decoder: decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( mel_specs, 
encoder_outputs, alignments, input_mask ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) + return outputs @torch.no_grad() - def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None): - inputs = self.embedding(characters) + def inference(self, text_input, aux_input=None): + aux_input = self._format_aux_input(aux_input) + inputs = self.embedding(text_input) encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"]) + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) decoder_outputs = decoder_outputs.transpose(1, 2) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + outputs = { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + return outputs + + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
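        A minimal sketch of the new dict-based step API introduced by this change (all tensor names here are
        hypothetical placeholders; `model` stands for a configured `Tacotron` instance and `criterion` for its
        matching Tacotron loss):

        >>> batch = {
        ...     "text_input": text_ids, "text_lengths": text_lens,
        ...     "mel_input": mels, "mel_lengths": mel_lens,
        ...     "linear_input": linears, "stop_targets": stops,
        ...     "speaker_ids": None, "d_vectors": None,
        ... }
        >>> outputs, loss_dict = model.train_step(batch, criterion)
        >>> sorted(outputs)[:3]   # forward() now returns a dict instead of a tuple
        ['alignments', 'alignments_backward', 'decoder_outputs']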
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] + """ + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + d_vectors = batch["d_vectors"] + + # forward pass model + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + ) + + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r + + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) + + # compute loss + loss_dict = criterion( + outputs["model_outputs"], + outputs["decoder_outputs"], + mel_input, + linear_input, + outputs["stop_tokens"], + stop_targets, + mel_lengths, + outputs["decoder_outputs_backward"], + outputs["alignments"], + alignment_lengths, + outputs["alignments_backward"], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_spectrogram(pred_spec.T) + return figures, {"audio": train_audio} + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return self.train_log(ap, batch, outputs) diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 525eb8b3..eaca3ff8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -1,151 +1,84 @@ +# coding: utf-8 + +from typing import Dict, Tuple + import torch +from coqpit import Coqpit from torch import nn from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet -from TTS.tts.models.tacotron_abstract import TacotronAbstract +from TTS.tts.models.base_tacotron import BaseTacotron +from TTS.tts.utils.measures import alignment_diagonal_score +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.audio import AudioProcessor -# TODO: match function arguments with tacotron -class Tacotron2(TacotronAbstract): +class Tacotron2(BaseTacotron): """Tacotron2 as in https://arxiv.org/abs/1712.05884 - - It's an autoregressive 
encoder-attention-decoder-postnet architecture. - - Args: - num_chars (int): number of input characters to define the size of embedding layer. - num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. - r (int): initial model reduction rate. - postnet_output_dim (int, optional): postnet output channels. Defaults to 80. - decoder_output_dim (int, optional): decoder output channels. Defaults to 80. - attn_type (str, optional): attention type. Check ```TTS.tts.layers.tacotron.common_layers.init_attn```. Defaults to 'original'. - attn_win (bool, optional): enable/disable attention windowing. - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". - prenet_type (str, optional): prenet type for the decoder. Defaults to "original". - prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. - prenet_dropout_at_inference (bool, optional): use dropout at inference time. This leads to a better quality for - some models. Defaults to False. - forward_attn (bool, optional): enable/disable forward attention. - It is only valid if ```attn_type``` is ```original```. Defaults to False. - trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. - forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. - location_attn (bool, optional): enable/disable location sensitive attention. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. - separate_stopnet (bool, optional): enable/disable separate stopnet training without only gradient - flow from stopnet to the rest of the model. Defaults to True. - bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. - ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. - encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. - decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. - use_gst (bool, optional): enable/disable Global style token module. - gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. + Check `TacotronConfig` for the arguments. 
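    Example:
        A construction sketch under assumptions: the config class name and its import path are not part of this
        diff, so treat `Tacotron2Config` and the input tensors below as placeholders for whatever Coqpit-based
        Tacotron2 config and batch your release provides.

        >>> config = Tacotron2Config()     # assumed config class; any Coqpit carrying the Tacotron2 fields works
        >>> model = Tacotron2(config)      # __init__ copies every config field onto `self`
        >>> outputs = model.inference(text_ids, aux_input={"speaker_ids": None, "d_vectors": None, "style_mel": None})
        >>> spec = outputs["model_outputs"]    # postnet output, [B, T_out, num_mels]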
""" - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - ): - super().__init__( - num_chars, - num_speakers, - r, - postnet_output_dim, - decoder_output_dim, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - prenet_dropout_at_inference, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, - bidirectional_decoder, - double_decoder_consistency, - ddc_r, - encoder_in_features, - decoder_in_features, - speaker_embedding_dim, - use_gst, - gst, - ) + def __init__(self, config: Coqpit): + super().__init__(config) + + chars, self.config = self.get_characters(config) + self.num_chars = len(chars) + self.decoder_output_dim = config.out_channels + + # pass all config fields to `self` + # for fewer code change + for key in config: + setattr(self, key, config[key]) # speaker embedding layer if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) - self.speaker_embedding.weight.data.normal_(0, 0.3) + self.init_multispeaker(config) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += self.embedded_speaker_dim # add speaker embedding dim + + if self.use_gst: + self.decoder_in_features += self.gst.gst_embedding_dim # embedding layer - self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) + self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) # base model layers self.encoder = Encoder(self.encoder_in_features) self.decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, + self.r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) - self.postnet = Postnet(self.postnet_output_dim) + self.postnet = Postnet(self.out_channels) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_inference = self.prenet_dropout_at_inference # global style token layers - if self.gst and use_gst: + if self.gst and self.use_gst: self.gst_layer = GST( - num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, - num_heads=gst.gst_num_heads, - num_style_tokens=gst.gst_num_style_tokens, - gst_embedding_dim=gst.gst_embedding_dim, + num_mel=self.decoder_output_dim, + d_vector_dim=self.d_vector_dim + if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding + else None, + num_heads=self.gst.gst_num_heads, + 
num_style_tokens=self.gst.gst_num_style_tokens, + gst_embedding_dim=self.gst.gst_embedding_dim, ) # backward pass decoder @@ -156,18 +89,19 @@ class Tacotron2(TacotronAbstract): self.coarse_decoder = Decoder( self.decoder_in_features, self.decoder_output_dim, - ddc_r, - attn_type, - attn_win, - attn_norm, - prenet_type, - prenet_dropout, - forward_attn, - trans_agent, - forward_attn_mask, - location_attn, - attn_K, - separate_stopnet, + self.ddc_r, + self.attention_type, + self.attention_win, + self.attention_norm, + self.prenet_type, + self.prenet_dropout, + self.use_forward_attn, + self.transition_agent, + self.forward_attn_mask, + self.location_attn, + self.attention_heads, + self.separate_stopnet, + self.max_decoder_steps, ) @staticmethod @@ -176,16 +110,17 @@ class Tacotron2(TacotronAbstract): mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2) return mel_outputs, mel_outputs_postnet, alignments - def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None): + def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None): """ Shapes: text: [B, T_in] text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - speaker_ids: [B, 1] - speaker_embeddings: [B, C] + aux_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ + aux_input = self._format_aux_input(aux_input) + outputs = {"alignments_backward": None, "decoder_outputs_backward": None} # compute mask for padding # B x T_in_max (boolean) input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths) @@ -195,15 +130,17 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, speaker_embeddings) + encoder_outputs = self.compute_gst( + encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None + ) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -222,67 +159,140 @@ class Tacotron2(TacotronAbstract): decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) if self.bidirectional_decoder: decoder_outputs_backward, alignments_backward = self._backward_pass(mel_specs, encoder_outputs, input_mask) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward if self.double_decoder_consistency: decoder_outputs_backward, alignments_backward = self._coarse_decoder_pass( mel_specs, encoder_outputs, alignments, input_mask ) - return ( - decoder_outputs, - postnet_outputs, - alignments, - stop_tokens, - decoder_outputs_backward, - alignments_backward, - ) - return decoder_outputs, postnet_outputs, 
alignments, stop_tokens + outputs["alignments_backward"] = alignments_backward + outputs["decoder_outputs_backward"] = decoder_outputs_backward + outputs.update( + { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + ) + return outputs @torch.no_grad() - def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def inference(self, text, aux_input=None): + aux_input = self._format_aux_input(aux_input) embedded_inputs = self.embedding(text).transpose(1, 2) encoder_outputs = self.encoder.inference(embedded_inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + if not self.use_d_vectors: + embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] + # reshape embedded_speakers + if embedded_speakers.ndim == 1: + embedded_speakers = embedded_speakers[None, None, :] + elif embedded_speakers.ndim == 2: + embedded_speakers = embedded_speakers[None, :] + else: + embedded_speakers = aux_input["d_vectors"] + + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = decoder_outputs + postnet_outputs decoder_outputs, postnet_outputs, alignments = self.shape_outputs(decoder_outputs, postnet_outputs, alignments) - return decoder_outputs, postnet_outputs, alignments, stop_tokens + outputs = { + "model_outputs": postnet_outputs, + "decoder_outputs": decoder_outputs, + "alignments": alignments, + "stop_tokens": stop_tokens, + } + return outputs - def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None): + def train_step(self, batch, criterion): + """Perform a single training step by fetching the right set if samples from the batch. 
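        A worked example of the alignment-length adjustment performed below for guided attention (the values are
        arbitrary): with reduction factor r = 2 and mel lengths [97, 80], the max length 97 is not a multiple of r,
        so every length is padded by r - (97 % r) = 1 before the integer division:

        >>> r = 2
        >>> mel_lengths = [97, 80]
        >>> [(m + (r - max(mel_lengths) % r)) // r for m in mel_lengths]
        [49, 40]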
+ + Args: + batch ([type]): [description] + criterion ([type]): [description] """ - Preserve model states for continuous inference - """ - embedded_inputs = self.embedding(text).transpose(1, 2) - encoder_outputs = self.encoder.inference_truncated(embedded_inputs) + text_input = batch["text_input"] + text_lengths = batch["text_lengths"] + mel_input = batch["mel_input"] + mel_lengths = batch["mel_lengths"] + linear_input = batch["linear_input"] + stop_targets = batch["stop_targets"] + speaker_ids = batch["speaker_ids"] + d_vectors = batch["d_vectors"] - if self.gst: - # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, style_mel, speaker_embeddings) + # forward pass model + outputs = self.forward( + text_input, + text_lengths, + mel_input, + mel_lengths, + aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, + ) - if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None] - speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + # set the [alignment] lengths wrt reduction factor for guided attention + if mel_lengths.max() % self.decoder.r != 0: + alignment_lengths = ( + mel_lengths + (self.decoder.r - (mel_lengths.max() % self.decoder.r)) + ) // self.decoder.r + else: + alignment_lengths = mel_lengths // self.decoder.r - mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs) - mel_outputs_postnet = self.postnet(mel_outputs) - mel_outputs_postnet = mel_outputs + mel_outputs_postnet - mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(mel_outputs, mel_outputs_postnet, alignments) - return mel_outputs, mel_outputs_postnet, alignments, stop_tokens + aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input) + + # compute loss + loss_dict = criterion( + outputs["model_outputs"], + outputs["decoder_outputs"], + mel_input, + linear_input, + outputs["stop_tokens"], + stop_targets, + mel_lengths, + outputs["decoder_outputs_backward"], + outputs["alignments"], + alignment_lengths, + outputs["alignments_backward"], + text_lengths, + ) + + # compute alignment error (the lower the better ) + align_error = 1 - alignment_diagonal_score(outputs["alignments"]) + loss_dict["align_error"] = align_error + return outputs, loss_dict + + def train_log(self, ap: AudioProcessor, batch: dict, outputs: dict) -> Tuple[Dict, Dict]: + postnet_outputs = outputs["model_outputs"] + alignments = outputs["alignments"] + alignments_backward = outputs["alignments_backward"] + mel_input = batch["mel_input"] + + pred_spec = postnet_outputs[0].data.cpu().numpy() + gt_spec = mel_input[0].data.cpu().numpy() + align_img = alignments[0].data.cpu().numpy() + + figures = { + "prediction": plot_spectrogram(pred_spec, ap, output_fig=False), + "ground_truth": plot_spectrogram(gt_spec, ap, output_fig=False), + "alignment": plot_alignment(align_img, output_fig=False), + } + + if self.bidirectional_decoder or self.double_decoder_consistency: + figures["alignment_backward"] = plot_alignment(alignments_backward[0].data.cpu().numpy(), output_fig=False) + + # Sample audio + train_audio = ap.inv_melspectrogram(pred_spec.T) + return figures, {"audio": train_audio} + + def eval_step(self, batch, criterion): + return self.train_step(batch, criterion) + + def eval_log(self, ap, batch, outputs): + return 
self.train_log(ap, batch, outputs) diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py deleted file mode 100644 index e684ce7c..00000000 --- a/TTS/tts/models/tacotron_abstract.py +++ /dev/null @@ -1,218 +0,0 @@ -import copy -from abc import ABC, abstractmethod - -import torch -from torch import nn - -from TTS.tts.utils.generic_utils import sequence_mask - - -class TacotronAbstract(ABC, nn.Module): - def __init__( - self, - num_chars, - num_speakers, - r, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type="original", - attn_win=False, - attn_norm="softmax", - prenet_type="original", - prenet_dropout=True, - prenet_dropout_at_inference=False, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=5, - separate_stopnet=True, - bidirectional_decoder=False, - double_decoder_consistency=False, - ddc_r=None, - encoder_in_features=512, - decoder_in_features=512, - speaker_embedding_dim=None, - use_gst=False, - gst=None, - ): - """Abstract Tacotron class""" - super().__init__() - self.num_chars = num_chars - self.r = r - self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim - self.use_gst = use_gst - self.gst = gst - self.num_speakers = num_speakers - self.bidirectional_decoder = bidirectional_decoder - self.double_decoder_consistency = double_decoder_consistency - self.ddc_r = ddc_r - self.attn_type = attn_type - self.attn_win = attn_win - self.attn_norm = attn_norm - self.prenet_type = prenet_type - self.prenet_dropout = prenet_dropout - self.prenet_dropout_at_inference = prenet_dropout_at_inference - self.forward_attn = forward_attn - self.trans_agent = trans_agent - self.forward_attn_mask = forward_attn_mask - self.location_attn = location_attn - self.attn_K = attn_K - self.separate_stopnet = separate_stopnet - self.encoder_in_features = encoder_in_features - self.decoder_in_features = decoder_in_features - self.speaker_embedding_dim = speaker_embedding_dim - - # layers - self.embedding = None - self.encoder = None - self.decoder = None - self.postnet = None - - # multispeaker - if self.speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False - else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True - - # global style token - if self.gst and use_gst: - self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim - self.gst_layer = None - - # model states - self.speaker_embeddings = None - self.speaker_embeddings_projected = None - - # additional layers - self.decoder_backward = None - self.coarse_decoder = None - - ############################# - # INIT FUNCTIONS - ############################# - - def _init_states(self): - self.speaker_embeddings = None - self.speaker_embeddings_projected = None - - def _init_backward_decoder(self): - self.decoder_backward = copy.deepcopy(self.decoder) - - def _init_coarse_decoder(self): - self.coarse_decoder = copy.deepcopy(self.decoder) - self.coarse_decoder.r_init = self.ddc_r - self.coarse_decoder.set_r(self.ddc_r) - - ############################# - # CORE FUNCTIONS - ############################# - - @abstractmethod - def forward(self): - pass - - @abstractmethod - def inference(self): - pass - - def load_checkpoint( - self, config, checkpoint_path, eval=False - ): # pylint: disable=unused-argument, redefined-builtin - state = 
torch.load(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - self.decoder.set_r(state["r"]) - if eval: - self.eval() - assert not self.training - - ############################# - # COMMON COMPUTE FUNCTIONS - ############################# - - def compute_masks(self, text_lengths, mel_lengths): - """Compute masks against sequence paddings.""" - # B x T_in_max (boolean) - device = text_lengths.device - input_mask = sequence_mask(text_lengths).to(device) - output_mask = None - if mel_lengths is not None: - max_len = mel_lengths.max() - r = self.decoder.r - max_len = max_len + (r - (max_len % r)) if max_len % r > 0 else max_len - output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device) - return input_mask, output_mask - - def _backward_pass(self, mel_specs, encoder_outputs, mask): - """Run backwards decoder""" - decoder_outputs_b, alignments_b, _ = self.decoder_backward( - encoder_outputs, torch.flip(mel_specs, dims=(1,)), mask - ) - decoder_outputs_b = decoder_outputs_b.transpose(1, 2).contiguous() - return decoder_outputs_b, alignments_b - - def _coarse_decoder_pass(self, mel_specs, encoder_outputs, alignments, input_mask): - """Double Decoder Consistency""" - T = mel_specs.shape[1] - if T % self.coarse_decoder.r > 0: - padding_size = self.coarse_decoder.r - (T % self.coarse_decoder.r) - mel_specs = torch.nn.functional.pad(mel_specs, (0, 0, 0, padding_size, 0, 0)) - decoder_outputs_backward, alignments_backward, _ = self.coarse_decoder( - encoder_outputs.detach(), mel_specs, input_mask - ) - # scale_factor = self.decoder.r_init / self.decoder.r - alignments_backward = torch.nn.functional.interpolate( - alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest" - ).transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2) - decoder_outputs_backward = decoder_outputs_backward[:, :T, :] - return decoder_outputs_backward, alignments_backward - - ############################# - # EMBEDDING FUNCTIONS - ############################# - - def compute_speaker_embedding(self, speaker_ids): - """Compute speaker embedding vectors""" - if hasattr(self, "speaker_embedding") and speaker_ids is None: - raise RuntimeError(" [!] 
Model has speaker embedding layer but speaker_id is not provided") - if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) - if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) - - def compute_gst(self, inputs, style_input, speaker_embedding=None): - """Compute global style token""" - device = inputs.device - if isinstance(style_input, dict): - query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).to(device) - if speaker_embedding is not None: - query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1) - - _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens) - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) - for k_token, v_amplifier in style_input.items(): - key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1) - gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key) - gst_outputs = gst_outputs + gst_outputs_att * v_amplifier - elif style_input is None: - gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).to(device) - else: - gst_outputs = self.gst_layer(style_input, speaker_embedding) # pylint: disable=not-callable - inputs = self._concat_speaker_embedding(inputs, gst_outputs) - return inputs - - @staticmethod - def _add_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + speaker_embeddings_ - return outputs - - @staticmethod - def _concat_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) - return outputs diff --git a/TTS/tts/tf/models/tacotron2.py b/TTS/tts/tf/models/tacotron2.py index 9cc62070..7a1d695d 100644 --- a/TTS/tts/tf/models/tacotron2.py +++ b/TTS/tts/tf/models/tacotron2.py @@ -12,7 +12,7 @@ class Tacotron2(keras.models.Model): num_chars, num_speakers, r, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, @@ -31,7 +31,7 @@ class Tacotron2(keras.models.Model): super().__init__() self.r = r self.decoder_output_dim = decoder_output_dim - self.postnet_output_dim = postnet_output_dim + self.out_channels = out_channels self.bidirectional_decoder = bidirectional_decoder self.num_speakers = num_speakers self.speaker_embed_dim = 256 @@ -58,7 +58,7 @@ class Tacotron2(keras.models.Model): name="decoder", enable_tflite=enable_tflite, ) - self.postnet = Postnet(postnet_output_dim, 5, name="postnet") + self.postnet = Postnet(out_channels, 5, name="postnet") @tf.function(experimental_relax_shapes=True) def call(self, characters, text_lengths=None, frames=None, training=None): diff --git a/TTS/tts/tf/utils/generic_utils.py b/TTS/tts/tf/utils/generic_utils.py index 5b8b4ce2..91434a38 100644 --- a/TTS/tts/tf/utils/generic_utils.py +++ b/TTS/tts/tf/utils/generic_utils.py @@ -44,8 +44,7 @@ def sequence_mask(sequence_length, max_len=None): batch_size = sequence_length.size(0) seq_range = np.empty([0, max_len], dtype=np.int8) seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) - if sequence_length.is_cuda: - seq_range_expand = seq_range_expand.cuda() + seq_range_expand = seq_range_expand.type_as(sequence_length) seq_length_expand = sequence_length.unsqueeze(1).expand_as(seq_range_expand) # B x 
T_max return seq_range_expand < seq_length_expand @@ -84,7 +83,7 @@ def setup_model(num_chars, num_speakers, c, enable_tflite=False): num_chars=num_chars, num_speakers=num_speakers, r=c.r, - postnet_output_dim=c.audio["num_mels"], + out_channels=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"], attn_type=c.attention_type, attn_win=c.windowing, diff --git a/TTS/tts/utils/data.py b/TTS/tts/utils/data.py index 259a32d9..3ff52195 100644 --- a/TTS/tts/utils/data.py +++ b/TTS/tts/utils/data.py @@ -1,4 +1,5 @@ import numpy as np +import torch def _pad_data(x, length): @@ -65,3 +66,12 @@ class StandardScaler: X *= self.scale_ X += self.mean_ return X + + +# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) + # B x T_max + return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py deleted file mode 100644 index b0e53f33..00000000 --- a/TTS/tts/utils/generic_utils.py +++ /dev/null @@ -1,278 +0,0 @@ -import torch - -from TTS.utils.generic_utils import find_module - - -# from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): - if max_len is None: - max_len = sequence_length.data.max() - seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) - # B x T_max - return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) - - -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): - print(" > Using model: {}".format(c.model)) - MyModel = find_module("TTS.tts.models", c.model.lower()) - if c.model.lower() in "tacotron": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=int(c.audio["fft_size"] / 2 + 1), - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - memory_size=c.memory_size, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "tacotron2": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio["num_mels"], - decoder_output_dim=c.audio["num_mels"], - use_gst=c.use_gst, - gst=c.gst, - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - prenet_dropout_at_inference=c.prenet_dropout_at_inference, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder, - double_decoder_consistency=c.double_decoder_consistency, - ddc_r=c.ddc_r, - 
speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "glow_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - hidden_channels_enc=c["hidden_channels_encoder"], - hidden_channels_dec=c["hidden_channels_decoder"], - hidden_channels_dp=c["hidden_channels_duration_predictor"], - out_channels=c.audio["num_mels"], - encoder_type=c.encoder_type, - encoder_params=c.encoder_params, - use_encoder_prenet=c["use_encoder_prenet"], - inference_noise_scale=c.inference_noise_scale, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.05, - num_speakers=num_speakers, - c_in_channels=0, - num_splits=4, - num_squeeze=2, - sigmoid_scale=False, - mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, - ) - elif c.model.lower() == "speedy_speech": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - positional_encoding=c["positional_encoding"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - elif c.model.lower() == "align_tts": - model = MyModel( - num_chars=num_chars + getattr(c, "add_blank", False), - out_channels=c.audio["num_mels"], - hidden_channels=c["hidden_channels"], - hidden_channels_dp=c["hidden_channels_dp"], - encoder_type=c["encoder_type"], - encoder_params=c["encoder_params"], - decoder_type=c["decoder_type"], - decoder_params=c["decoder_params"], - c_in_channels=0, - ) - return model - - -def is_tacotron(c): - return "tacotron" in c["model"].lower() - - -# def check_config_tts(c): -# check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech', 'align_tts'], restricted=True, val_type=str) -# check_argument('run_name', c, restricted=True, val_type=str) -# check_argument('run_description', c, val_type=str) - -# # AUDIO -# # check_argument('audio', c, restricted=True, val_type=dict) - -# # audio processing parameters -# # check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) -# # check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) -# # check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) -# # check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') -# # check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') -# # check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) -# # check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) -# # check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) -# # check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) -# # check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) - -# # vocabulary parameters -# check_argument('characters', c, restricted=False, val_type=dict) -# check_argument('pad', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('eos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), 
val_type=str) -# check_argument('bos', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('characters', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) -# check_argument('phonemes', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys() and c['use_phonemes'], val_type=str) -# check_argument('punctuations', c['characters'] if 'characters' in c.keys() else {}, restricted='characters' in c.keys(), val_type=str) - -# # normalization parameters -# # check_argument('signal_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('symmetric_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('max_norm', c['audio'], restricted=True, val_type=float, min_val=0.1, max_val=1000) -# # check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) -# # check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) -# # check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) -# # check_argument('spec_gain', c['audio'], restricted=True, val_type=[int, float], min_val=1, max_val=100) -# # check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) -# # check_argument('trim_db', c['audio'], restricted=True, val_type=int) - -# # training parameters -# # check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) -# # check_argument('r', c, restricted=True, val_type=int, min_val=1) -# # check_argument('gradual_training', c, restricted=False, val_type=list) -# # check_argument('mixed_precision', c, restricted=False, val_type=bool) -# # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) - -# # loss parameters -# # check_argument('loss_masking', c, restricted=True, val_type=bool) -# # if c['model'].lower() in ['tacotron', 'tacotron2']: -# # check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# # check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('ssim_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('l1_alpha', c, restricted=True, val_type=float, min_val=0) -# check_argument('huber_alpha', c, restricted=True, val_type=float, min_val=0) - -# # validation parameters -# # check_argument('run_eval', c, restricted=True, val_type=bool) -# # check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) -# # check_argument('test_sentences_file', c, restricted=False, val_type=str) - -# # optimizer -# check_argument('noam_schedule', c, restricted=False, val_type=bool) -# check_argument('grad_clip', c, restricted=True, val_type=float, min_val=0.0) -# check_argument('epochs', c, restricted=True, val_type=int, min_val=1) -# check_argument('lr', c, restricted=True, val_type=float, min_val=0) -# 
check_argument('wd', c, restricted=is_tacotron(c), val_type=float, min_val=0) -# check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0) -# check_argument('seq_len_norm', c, restricted=is_tacotron(c), val_type=bool) - -# # tacotron prenet -# # check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1) -# # check_argument('prenet_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['original', 'bn']) -# # check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool) - -# # attention -# check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original', 'dynamic_convolution']) -# check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int) -# check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax']) -# check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('use_forward_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('forward_attn_mask', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('location_attn', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('bidirectional_decoder', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool) -# check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) - -# if c['model'].lower() in ['tacotron', 'tacotron2']: -# # stopnet -# # check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool) - -# # Model Parameters for non-tacotron models -# if c['model'].lower in ["speedy_speech", "align_tts"]: -# check_argument('positional_encoding', c, restricted=True, val_type=type) -# check_argument('encoder_type', c, restricted=True, val_type=str) -# check_argument('encoder_params', c, restricted=True, val_type=dict) -# check_argument('decoder_residual_conv_bn_params', c, restricted=True, val_type=dict) - -# # GlowTTS parameters -# check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str) - -# # tensorboard -# # check_argument('print_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('tb_plot_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('save_step', c, restricted=True, val_type=int, min_val=1) -# # check_argument('checkpoint', c, restricted=True, val_type=bool) -# # check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) - -# # dataloading -# # pylint: disable=import-outside-toplevel -# from TTS.tts.utils.text import cleaners -# # check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) -# # check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) -# # check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) -# # check_argument('batch_group_size', c, restricted=True, val_type=int, min_val=0) -# # check_argument('min_seq_len', c, restricted=True, val_type=int, min_val=0) -# # check_argument('max_seq_len', c, restricted=True, val_type=int, min_val=10) -# # 
check_argument('compute_input_seq_cache', c, restricted=True, val_type=bool) - -# # paths -# # check_argument('output_path', c, restricted=True, val_type=str) - -# # multi-speaker and gst -# # check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) -# # check_argument('use_external_speaker_embedding_file', c, restricted=c['use_speaker_embedding'], val_type=bool) -# # check_argument('external_speaker_embedding_file', c, restricted=c['use_external_speaker_embedding_file'], val_type=str) -# if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']: -# # check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst', c, restricted=is_tacotron(c), val_type=dict) -# # check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict]) -# # check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000) -# # check_argument('gst_use_speaker_embedding', c['gst'], restricted=is_tacotron(c), val_type=bool) -# # check_argument('gst_num_heads', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10) -# # check_argument('gst_num_style_tokens', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000) - -# # datasets - checking only the first entry -# # check_argument('datasets', c, restricted=True, val_type=list) -# # for dataset_entry in c['datasets']: -# # check_argument('name', dataset_entry, restricted=True, val_type=str) -# # check_argument('path', dataset_entry, restricted=True, val_type=str) -# # check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list]) -# # check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py deleted file mode 100644 index bb8432fa..00000000 --- a/TTS/tts/utils/io.py +++ /dev/null @@ -1,120 +0,0 @@ -import datetime -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - """Load ```TTS.tts.models``` checkpoints. - - Args: - model (TTS.tts.models): model object to load the weights for. - checkpoint_path (string): checkpoint file path. - amp (apex.amp, optional): Apex amp abject to load apex related state vars. Defaults to None. - use_cuda (bool, optional): load model to GPU if True. Defaults to False. - - Returns: - [type]: [description] - """ - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if amp and "amp" in state: - amp.load_state_dict(state["amp"]) - if use_cuda: - model.cuda() - # set model stepsize - if hasattr(model.decoder, "r"): - model.decoder.set_r(state["r"]) - print(" > Model r: ", state["r"]) - if eval: - model.eval() - return model, state - - -def save_model(model, optimizer, current_step, epoch, r, output_path, characters, amp_state_dict=None, **kwargs): - """Save ```TTS.tts.models``` states with extra fields. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. 
- r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - amp_state_dict (state_dict, optional): Apex.amp state dict if Apex is enabled. Defaults to None. - """ - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - state = { - "model": model_state, - "optimizer": optimizer.state_dict() if optimizer is not None else None, - "step": current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - "r": r, - "characters": characters, - } - if amp_state_dict: - state["amp"] = amp_state_dict - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs): - """Save model checkpoint, intended for saving checkpoints at training. - - Args: - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - """ - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch, r, checkpoint_path, characters, **kwargs) - - -def save_best_model( - target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, characters, **kwargs -): - """Save model checkpoint, intended for saving the best model after each epoch. - It compares the current model loss with the best loss so far and saves the - model if the current loss is better. - - Args: - target_loss (float): current model loss. - best_loss (float): best loss so far. - model (TTS.tts.models.Model): models object to be saved. - optimizer (torch.optim.optimizers.Optimizer): model optimizer used for training. - current_step (int): current number of training steps. - epoch (int): current number of training epochs. - r (int): model reduction rate for Tacotron models. - output_path (str): output path to save the model file. - characters (list): list of characters used in the model. - - Returns: - float: updated current best loss. 
- """ - if target_loss < best_loss: - file_name = "best_model.pth.tar" - checkpoint_path = os.path.join(output_folder, file_name) - print(" >> BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, optimizer, current_step, epoch, r, checkpoint_path, characters, model_loss=target_loss, **kwargs - ) - best_loss = target_loss - return best_loss diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 1b8c054d..a8c9e0f6 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,164 +1,94 @@ import json import os import random -from typing import Union +from typing import Any, Dict, List, Tuple, Union import numpy as np import torch +from coqpit import Coqpit from TTS.config import load_config from TTS.speaker_encoder.utils.generic_utils import setup_model from TTS.utils.audio import AudioProcessor -def make_speakers_json_path(out_path): - """Returns conventional speakers.json location.""" - return os.path.join(out_path, "speakers.json") - - -def load_speaker_mapping(out_path): - """Loads speaker mapping if already present.""" - if os.path.splitext(out_path)[1] == ".json": - json_file = out_path - else: - json_file = make_speakers_json_path(out_path) - with open(json_file) as f: - return json.load(f) - - -def save_speaker_mapping(out_path, speaker_mapping): - """Saves speaker mapping if not yet present.""" - if out_path is not None: - speakers_json_path = make_speakers_json_path(out_path) - with open(speakers_json_path, "w") as f: - json.dump(speaker_mapping, f, indent=4) - - -def get_speakers(items): - """Returns a sorted, unique list of speakers in a given dataset.""" - speakers = {e[2] for e in items} - return sorted(speakers) - - -def parse_speakers(c, args, meta_data_train, OUT_PATH): - """Returns number of speakers, speaker embedding shape and speaker mapping""" - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - if not speaker_mapping: - print( - "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file" - ) - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - if not speaker_mapping: - raise RuntimeError( - "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" - ) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - not c.use_external_speaker_embedding_file - ): # if restore checkpoint and don't use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - speaker_embedding_dim = None - assert all(speaker in speaker_mapping for speaker in speakers), ( - "As of now you, you cannot " "introduce new speakers to " "a previously trained model." 
- ) - elif ( - c.use_external_speaker_embedding_file and c.external_speaker_embedding_file - ): # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]) - elif ( - c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file - ): # if start new train using External Embedding file and don't pass external embedding file - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - else: # if start new train and don't use External Embedding file - speaker_mapping = {name: i for i, name in enumerate(speakers)} - speaker_embedding_dim = None - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers))) - else: - num_speakers = 0 - speaker_embedding_dim = None - speaker_mapping = None - - return num_speakers, speaker_embedding_dim, speaker_mapping - - class SpeakerManager: - """It manages the multi-speaker setup for 🐸TTS models. It loads the speaker files and parses the information - in a way that you can query. There are 3 different scenarios considered. + """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information + in a way that can be queried by speaker or clip. - 1. Models using speaker embedding layers. The metafile only includes a mapping of speaker names to ids. - 2. Models using external embedding vectors (x vectors). The metafile includes a dictionary in the following - format. + There are 3 different scenarios considered: - ``` - { - 'clip_name.wav':{ - 'name': 'speakerA', - 'embedding'[] - }, - ... - } - ``` + 1. Models using speaker embedding layers. The datafile only maps speaker names to ids used by the embedding layer. + 2. Models using d-vectors. The datafile includes a dictionary in the following format. - 3. Computing x vectors at inference with the speaker encoder. It loads the speaker encoder model and - computes x vectors for a given instance. + :: - >>> >>> # load audio processor and speaker encoder - >>> ap = AudioProcessor(**config.audio) - >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) - >>> # load a sample audio and compute embedding - >>> waveform = ap.load_wav(sample_wav_path) - >>> mel = ap.melspectrogram(waveform) - >>> x_vector = manager.compute_x_vector(mel.T) + { + 'clip_name.wav':{ + 'name': 'speakerA', + 'embedding'[] + }, + ... + } + + + 3. Computing the d-vectors by the speaker encoder. It loads the speaker encoder model and + computes the d-vectors for a given clip or speaker. Args: - x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". - speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by the - TTS model. Defaults to "". + d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". + speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by + TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". 
encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "". + + Examples: + >>> # load audio processor and speaker encoder + >>> ap = AudioProcessor(**config.audio) + >>> manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path) + >>> # load a sample audio and compute embedding + >>> waveform = ap.load_wav(sample_wav_path) + >>> mel = ap.melspectrogram(waveform) + >>> d_vector = manager.compute_d_vector(mel.T) """ def __init__( self, - x_vectors_file_path: str = "", + data_items: List[List[Any]] = None, + d_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", use_cuda: bool = False, ): - self.x_vectors = None - self.speaker_ids = None - self.clip_ids = None + self.data_items = [] + self.d_vectors = {} + self.speaker_ids = {} + self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None self.use_cuda = use_cuda - if x_vectors_file_path: - self.load_x_vectors_file(x_vectors_file_path) + if data_items: + self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items) + + if d_vectors_file_path: + self.set_d_vectors_from_file(d_vectors_file_path) if speaker_id_file_path: - self.load_ids_file(speaker_id_file_path) + self.set_speaker_ids_from_file(speaker_id_file_path) if encoder_model_path and encoder_config_path: self.init_speaker_encoder(encoder_model_path, encoder_config_path) @staticmethod - def _load_json(json_file_path: str): + def _load_json(json_file_path: str) -> Dict: with open(json_file_path) as f: return json.load(f) @staticmethod - def _save_json(json_file_path: str, data: dict): + def _save_json(json_file_path: str, data: dict) -> None: with open(json_file_path, "w") as f: json.dump(data, f, indent=4) @@ -167,54 +97,131 @@ class SpeakerManager: return len(self.speaker_ids) @property - def x_vector_dim(self): - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + def speaker_names(self): + return list(self.speaker_ids.keys()) - def parser_speakers_from_items(self, items: list): - speaker_ids = sorted({item[2] for item in items}) - self.speaker_ids = speaker_ids + @property + def d_vector_dim(self): + """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" + if self.d_vectors: + return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) + return 0 + + @staticmethod + def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: + """Parse speaker IDs from data samples retured by `load_meta_data()`. + + Args: + items (list): Data sampled returned by `load_meta_data()`. + + Returns: + Tuple[Dict, int]: speaker IDs and number of speakers. + """ + speakers = sorted({item[2] for item in items}) + speaker_ids = {name: i for i, name in enumerate(speakers)} num_speakers = len(speaker_ids) return speaker_ids, num_speakers - def save_ids_file(self, file_path: str): - self._save_json(file_path, self.speaker_ids) + def set_speaker_ids_from_data(self, items: List) -> None: + """Set speaker IDs from data samples. - def load_ids_file(self, file_path: str): + Args: + items (List): Data sampled returned by `load_meta_data()`. + """ + self.speaker_ids, _ = self.parse_speakers_from_data(items) + + def set_speaker_ids_from_file(self, file_path: str) -> None: + """Set speaker IDs from a file. + + Args: + file_path (str): Path to the file. 
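As a rough sketch of scenario 2 above, the snippet below loads a d-vector file in the format shown in the docstring and queries it; the file name, clip name and speaker name are placeholders rather than values defined by this change.

```python
from TTS.tts.utils.speakers import SpeakerManager

# "speakers.json", "clip_name.wav" and "speakerA" are hypothetical example names.
manager = SpeakerManager(d_vectors_file_path="speakers.json")
print(manager.num_speakers, manager.d_vector_dim)

d_vec = manager.get_d_vector_by_clip("clip_name.wav")  # d-vector of a single clip
mean_d_vec = manager.get_mean_d_vector("speakerA")     # average over all clips of a speaker
```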
+ """ self.speaker_ids = self._load_json(file_path) - def save_x_vectors_file(self, file_path: str): - self._save_json(file_path, self.x_vectors) + def save_speaker_ids_to_file(self, file_path: str) -> None: + """Save speaker IDs to a json file. - def load_x_vectors_file(self, file_path: str): - self.x_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) - self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.speaker_ids) - def get_x_vector_by_clip(self, clip_idx: str): - return self.x_vectors[clip_idx]["embedding"] + def save_d_vectors_to_file(self, file_path: str) -> None: + """Save d_vectors to a json file. - def get_x_vectors_by_speaker(self, speaker_idx: str): - return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.d_vectors) - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False): - x_vectors = self.get_x_vectors_by_speaker(speaker_idx) + def set_d_vectors_from_file(self, file_path: str) -> None: + """Load d_vectors from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.d_vectors = self._load_json(file_path) + speakers = sorted({x["name"] for x in self.d_vectors.values()}) + self.speaker_ids = {name: i for i, name in enumerate(speakers)} + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) + + def get_d_vector_by_clip(self, clip_idx: str) -> List: + """Get d_vector by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: d_vector as a list. + """ + return self.d_vectors[clip_idx]["embedding"] + + def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all d_vectors of a speaker. + + Args: + speaker_idx (str): Target speaker ID. + + Returns: + List[List]: all the d_vectors of the given speaker. + """ + return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] + + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean d_vector of a speaker ID. + + Args: + speaker_idx (str): Target speaker ID. + num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False. + + Returns: + np.ndarray: Mean d_vector. + """ + d_vectors = self.get_d_vectors_by_speaker(speaker_idx) if num_samples is None: - x_vectors = np.stack(x_vectors).mean(0) + d_vectors = np.stack(d_vectors).mean(0) else: - assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" + assert len(d_vectors) >= num_samples, f" [!] 
speaker {speaker_idx} has number of samples < {num_samples}" if randomize: - x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) + d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) else: - x_vectors = np.stack(x_vectors[:num_samples]).mean(0) - return x_vectors + d_vectors = np.stack(d_vectors[:num_samples]).mean(0) + return d_vectors - def get_speakers(self): + def get_speakers(self) -> List: return self.speaker_ids - def get_clips(self): - return sorted(self.x_vectors.keys()) + def get_clips(self) -> List: + return sorted(self.d_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_model(self.speaker_encoder_config) self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) @@ -223,7 +230,16 @@ class SpeakerManager: # self.speaker_encoder_ap.do_sound_norm = True # self.speaker_encoder_ap.do_trim_silence = True - def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: + def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a d_vector from a given audio file. + + Args: + wav_file (Union[str, list]): Target file path. + + Returns: + list: Computed d_vector. + """ + def _compute(wav_file: str): waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) spec = self.speaker_encoder_ap.melspectrogram(waveform) @@ -231,23 +247,31 @@ class SpeakerManager: if self.use_cuda: spec = spec.cuda() spec = spec.unsqueeze(0) - x_vector = self.speaker_encoder.compute_embedding(spec) - return x_vector + d_vector = self.speaker_encoder.compute_embedding(spec) + return d_vector if isinstance(wav_file, list): - # compute the mean x_vector - x_vectors = None + # compute the mean d_vector + d_vectors = None for wf in wav_file: - x_vector = _compute(wf) - if x_vectors is None: - x_vectors = x_vector + d_vector = _compute(wf) + if d_vectors is None: + d_vectors = d_vector else: - x_vectors += x_vector - return (x_vectors / len(wav_file))[0].tolist() - x_vector = _compute(wav_file) - return x_vector[0].tolist() + d_vectors += d_vector + return (d_vectors / len(wav_file))[0].tolist() + d_vector = _compute(wav_file) + return d_vector[0].tolist() - def compute_x_vector(self, feats): + def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute d_vector from features. + + Args: + feats (Union[torch.Tensor, np.ndarray]): Input features. + + Returns: + List: computed d_vector. + """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) if feats.ndim == 2: @@ -263,3 +287,90 @@ class SpeakerManager: def plot_embeddings(self): # TODO: implement speaker encoder raise NotImplementedError + + +def _set_file_path(path): + """Find the speakers.json under the given path or the above it. + Intended to band aid the different paths returned in restored and continued training.""" + path_restore = os.path.join(os.path.dirname(path), "speakers.json") + path_continue = os.path.join(path, "speakers.json") + if os.path.exists(path_restore): + return path_restore + if os.path.exists(path_continue): + return path_continue + raise FileNotFoundError(f" [!] 
`speakers.json` not found in {path}") + + +def load_speaker_mapping(out_path): + """Loads speaker mapping if already present.""" + if os.path.splitext(out_path)[1] == ".json": + json_file = out_path + else: + json_file = _set_file_path(out_path) + with open(json_file) as f: + return json.load(f) + + +def save_speaker_mapping(out_path, speaker_mapping): + """Saves speaker mapping if not yet present.""" + if out_path is not None: + speakers_json_path = _set_file_path(out_path) + with open(speakers_json_path, "w") as f: + json.dump(speaker_mapping, f, indent=4) + + +def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_path: str = None) -> SpeakerManager: + """Initiate a `SpeakerManager` instance by the provided config. + + Args: + c (Coqpit): Model configuration. + restore_path (str): Path to a previous training folder. + data (List): Data samples used in training to infer speakers from. It must be provided if speaker embedding + layers is used. Defaults to None. + out_path (str, optional): Save the generated speaker IDs to a output path. Defaults to None. + + Returns: + SpeakerManager: initialized and ready to use instance. + """ + speaker_manager = SpeakerManager() + if c.use_speaker_embedding: + if data is not None: + speaker_manager.set_speaker_ids_from_data(data) + if restore_path: + speakers_file = _set_file_path(restore_path) + # restoring speaker manager from a previous run. + if c.use_d_vector_file: + # restore speaker manager with the embedding file + if not os.path.exists(speakers_file): + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.d_vector_file") + if not os.path.exists(c.d_vector_file): + raise RuntimeError( + "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" + ) + speaker_manager.load_d_vectors_file(c.d_vector_file) + speaker_manager.set_d_vectors_from_file(speakers_file) + elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. + speaker_ids_from_data = speaker_manager.speaker_ids + speaker_manager.set_speaker_ids_from_file(speakers_file) + assert all( + speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + ), " [!] You cannot introduce new speakers to a pre-trained model." + elif c.use_d_vector_file and c.d_vector_file: + # new speaker manager with external speaker embeddings. + speaker_manager.set_d_vectors_from_file(c.d_vector_file) + elif c.use_d_vector_file and not c.d_vector_file: # new speaker manager with speaker IDs file. 
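A minimal sketch of driving `get_speaker_manager` from a config; `DummySpeakerConfig` is a hypothetical stand-in for a real model config that exposes `use_speaker_embedding`, `use_d_vector_file` and `d_vector_file`, and the data samples follow the `[text, wav_path, speaker_name]` layout returned by `load_meta_data()`.

```python
from dataclasses import dataclass

from coqpit import Coqpit

from TTS.tts.utils.speakers import get_speaker_manager


@dataclass
class DummySpeakerConfig(Coqpit):
    # hypothetical minimal config; real model configs carry many more fields
    use_speaker_embedding: bool = True
    use_d_vector_file: bool = False
    d_vector_file: str = ""


meta_data_train = [
    ["hello world.", "wavs/a.wav", "speaker_0"],
    ["good morning.", "wavs/b.wav", "speaker_1"],
]

speaker_manager = get_speaker_manager(DummySpeakerConfig(), data=meta_data_train)
print(speaker_manager.speaker_ids)  # {'speaker_0': 0, 'speaker_1': 1}
```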
+ raise "use_d_vector_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + print( + " > Training with {} speakers: {}".format( + speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + ) + ) + # save file if path is defined + if out_path: + out_file_path = os.path.join(out_path, "speakers.json") + print(f" > Saving `speakers.json` to {out_file_path}.") + if c.use_d_vector_file and c.d_vector_file: + speaker_manager.save_d_vectors_to_file(out_file_path) + else: + speaker_manager.save_speaker_ids_to_file(out_file_path) + return speaker_manager diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index 11107e47..caed575f 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -56,9 +56,6 @@ class SSIM(torch.nn.Module): window = self.window else: window = create_window(self.window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) window = window.type_as(img1) self.window = window @@ -69,10 +66,6 @@ class SSIM(torch.nn.Module): def ssim(img1, img2, window_size=11, size_average=True): (_, channel, _, _) = img1.size() - window = create_window(window_size, channel) - - if img1.is_cuda: - window = window.cuda(img1.get_device()) + window = create_window(window_size, channel).type_as(img1) window = window.type_as(img1) - return _ssim(img1, img2, window, window_size, channel, size_average) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 0ddf7ebe..39474cab 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,8 +1,10 @@ import os +from typing import Dict import numpy as np import pkg_resources import torch +from torch import nn from .text import phoneme_to_sequence, text_to_sequence @@ -13,7 +15,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed: import tensorflow as tf -def text_to_seqvec(text, CONFIG): +def text_to_seq(text, CONFIG): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -65,61 +67,45 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None, speaker_embeddings=None): - if "tacotron" in CONFIG.model.lower(): - if CONFIG.gst: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, style_mel=style_mel, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - if truncated: - decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - else: - decoder_output, postnet_output, alignments, stop_tokens = model.inference( - inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings - ) - elif "glow" in CONFIG.model.lower(): - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, _, _, _, alignments, _, _ = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, _, _, _, alignments, _, _ = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to 
tacotron models. - decoder_output = None - stop_tokens = None - elif CONFIG.model.lower() in ["speedy_speech", "align_tts"]: - inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - if hasattr(model, "module"): - # distributed model - postnet_output, alignments = model.module.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - else: - postnet_output, alignments = model.inference( - inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings - ) - postnet_output = postnet_output.permute(0, 2, 1) - # these only belong to tacotron models. - decoder_output = None - stop_tokens = None +def run_model_torch( + model: nn.Module, + inputs: torch.Tensor, + speaker_id: int = None, + style_mel: torch.Tensor = None, + d_vector: torch.Tensor = None, +) -> Dict: + """Run a torch model for inference. It does not support batch inference. + + Args: + model (nn.Module): The model to run inference. + inputs (torch.Tensor): Input tensor with character ids. + speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None. + style_mel (torch.Tensor, optional): Spectrograms used for voice styling . Defaults to None. + d_vector (torch.Tensor, optional): d-vector for multi-speaker models . Defaults to None. + + Returns: + Dict: model outputs. + """ + input_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) + if hasattr(model, "module"): + _func = model.module.inference else: - raise ValueError("[!] Unknown model name.") - return decoder_output, postnet_output, alignments, stop_tokens + _func = model.inference + outputs = _func( + inputs, + aux_input={ + "x_lengths": input_lengths, + "speaker_ids": speaker_id, + "d_vectors": d_vector, + "style_mel": style_mel, + }, + ) + return outputs -def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tf(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: raise NotImplementedError(" [!] GST inference not implemented for TF") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TF") if speaker_id is not None: raise NotImplementedError(" [!] Multi-Speaker not implemented for TF") # TODO: handle multispeaker case @@ -127,11 +113,9 @@ def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=No return decoder_output, postnet_output, alignments, stop_tokens -def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_tflite(model, inputs, CONFIG, speaker_id=None, style_mel=None): if CONFIG.gst and style_mel is not None: raise NotImplementedError(" [!] GST inference not implemented for TfLite") - if truncated: - raise NotImplementedError(" [!] Truncated inference not implemented for TfLite") if speaker_id is not None: raise NotImplementedError(" [!] 
Multi-Speaker not implemented for TfLite") # get input and output details @@ -152,14 +136,6 @@ def run_model_tflite(model, inputs, CONFIG, truncated, speaker_id=None, style_me return decoder_output, postnet_output, None, None -def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): - postnet_output = postnet_output[0].data.cpu().numpy() - decoder_output = None if decoder_output is None else decoder_output[0].data.cpu().numpy() - alignment = alignments[0].cpu().data.numpy() - stop_tokens = None if stop_tokens is None else stop_tokens[0].cpu().numpy() - return postnet_output, decoder_output, alignment, stop_tokens - - def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): postnet_output = postnet_output[0].numpy() decoder_output = decoder_output[0].numpy() @@ -186,23 +162,22 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(speaker_id, cuda=False): +def speaker_id_to_torch(speaker_id, cuda=False): if speaker_id is not None: speaker_id = np.asarray(speaker_id) - # TODO: test this for tacotron models speaker_id = torch.from_numpy(speaker_id) if cuda: return speaker_id.cuda() return speaker_id -def embedding_to_torch(speaker_embedding, cuda=False): - if speaker_embedding is not None: - speaker_embedding = np.asarray(speaker_embedding) - speaker_embedding = torch.from_numpy(speaker_embedding).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(d_vector, cuda=False): + if d_vector is not None: + d_vector = np.asarray(d_vector) + d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: - return speaker_embedding.cuda() - return speaker_embedding + return d_vector.cuda() + return d_vector # TODO: perform GL with pytorch for batching @@ -231,11 +206,10 @@ def synthesis( ap, speaker_id=None, style_wav=None, - truncated=False, enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - speaker_embedding=None, + d_vector=None, backend="torch", ): """Synthesize voice for the given text. @@ -249,8 +223,6 @@ def synthesis( model outputs. speaker_id (int): id of speaker style_wav (str | Dict[str, float]): Uses for style embedding of GST. - truncated (bool): keep model states after inference. It can be used - for continuous inference at long texts. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. do_trim_silence (bool): trim silence after synthesis. 
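For reference, a sketch of calling the refactored `synthesis()` with its dict-style return; `tts_model`, `config` and `ap` are assumed to be initialized elsewhere (e.g. via the model's `load_checkpoint`, `load_config` and `AudioProcessor`), and the input text is arbitrary.

```python
from TTS.tts.utils.synthesis import synthesis

# tts_model, config and ap are assumed to exist already.
outputs = synthesis(
    model=tts_model,
    text="Hello there.",
    CONFIG=config,
    use_cuda=False,
    ap=ap,
    speaker_id=None,
    style_wav=None,
    use_griffin_lim=True,   # decode with Griffin-Lim instead of a neural vocoder
    do_trim_silence=True,
    d_vector=None,
)
wav = outputs["wav"]              # waveform; None when use_griffin_lim is False
spec = outputs["model_outputs"]   # postnet spectrogram as a numpy array
attn = outputs["alignments"]      # attention alignments
```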
backend (str): tf or torch @@ -263,54 +235,54 @@ def synthesis( else: style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) # preprocess the given text - inputs = text_to_seqvec(text, CONFIG) + text_inputs = text_to_seq(text, CONFIG) # pass tensors to backend if backend == "torch": if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if speaker_embedding is not None: - speaker_embedding = embedding_to_torch(speaker_embedding, cuda=use_cuda) + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) - inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) - inputs = inputs.unsqueeze(0) - elif backend == "tf": + text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) + text_inputs = text_inputs.unsqueeze(0) + elif backend in ["tf", "tflite"]: # TODO: handle speaker id for tf model style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) - elif backend == "tflite": - style_mel = numpy_to_tf(style_mel, tf.float32) - inputs = numpy_to_tf(inputs, tf.int32) - inputs = tf.expand_dims(inputs, 0) + text_inputs = numpy_to_tf(text_inputs, tf.int32) + text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( - model, inputs, CONFIG, truncated, speaker_id, style_mel, speaker_embeddings=speaker_embedding - ) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( - postnet_output, decoder_output, alignments, stop_tokens - ) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) + model_outputs = outputs["model_outputs"] + model_outputs = model_outputs[0].data.cpu().numpy() + alignments = outputs["alignments"] elif backend == "tf": decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( - model, inputs, CONFIG, truncated, speaker_id, style_mel + model, text_inputs, CONFIG, speaker_id, style_mel ) - postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( + model_outputs, decoder_output, alignments, stop_tokens = parse_outputs_tf( postnet_output, decoder_output, alignments, stop_tokens ) elif backend == "tflite": - decoder_output, postnet_output, alignment, stop_tokens = run_model_tflite( - model, inputs, CONFIG, truncated, speaker_id, style_mel + decoder_output, postnet_output, alignments, stop_tokens = run_model_tflite( + model, text_inputs, CONFIG, speaker_id, style_mel ) - postnet_output, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) + model_outputs, decoder_output = parse_outputs_tflite(postnet_output, decoder_output) # convert outputs to numpy # plot results wav = None if use_griffin_lim: - wav = inv_spectrogram(postnet_output, ap, CONFIG) + wav = inv_spectrogram(model_outputs, ap, CONFIG) # trim silence if do_trim_silence: wav = trim_silence(wav, ap) - return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs + return_dict = { + "wav": wav, + "alignments": alignments, + "model_outputs": model_outputs, + "text_inputs": text_inputs, + } + return return_dict diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 787394b5..fdccf7f1 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -1,4 +1,5 @@ # -*- 
coding: utf-8 -*- +# adapted from https://github.com/keithito/tacotron import re import unicodedata diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index 3d2caa97..4b041ed8 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -65,7 +65,7 @@ def basic_cleaners(text): def transliteration_cleaners(text): """Pipeline for non-English text that transliterates to ASCII.""" - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = lowercase(text) text = collapse_whitespace(text) return text @@ -89,7 +89,7 @@ def basic_turkish_cleaners(text): def english_cleaners(text): """Pipeline for English text, including number and abbreviation expansion.""" - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = lowercase(text) text = expand_time_english(text) text = expand_numbers(text) @@ -129,7 +129,7 @@ def chinese_mandarin_cleaners(text: str) -> str: def phoneme_cleaners(text): """Pipeline for phonemes mode, including number and abbreviation expansion.""" text = expand_numbers(text) - text = convert_to_ascii(text) + # text = convert_to_ascii(text) text = expand_abbreviations(text) text = replace_symbols(text) text = remove_aux_symbols(text) diff --git a/TTS/utils/arguments.py b/TTS/utils/arguments.py deleted file mode 100644 index 5e6acd1d..00000000 --- a/TTS/utils/arguments.py +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -"""Argument parser for training scripts.""" - -import argparse -import glob -import os -import re - -import torch - -from TTS.config import load_config -from TTS.tts.utils.text.symbols import parse_symbols -from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import create_experiment_folder, get_git_branch -from TTS.utils.io import copy_model_files -from TTS.utils.tensorboard_logger import TensorboardLogger - - -def init_arguments(argv): - """Parse command line arguments of training scripts. - - Args: - argv (list): This is a list of input arguments as given by sys.argv - - Returns: - argparse.Namespace: Parsed arguments. - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "--continue_path", - type=str, - help=( - "Training output folder to continue training. Used to continue " - "a training. If it is used, 'config_path' is ignored." - ), - default="", - required="--config_path" not in argv, - ) - parser.add_argument( - "--restore_path", type=str, help="Model file to be restored. Use to finetune a model.", default="" - ) - parser.add_argument( - "--best_path", - type=str, - help=( - "Best model file to be used for extracting best loss." - "If not specified, the latest best model in continue path is used" - ), - default="", - ) - parser.add_argument( - "--config_path", type=str, help="Path to config file for training.", required="--continue_path" not in argv - ) - parser.add_argument("--debug", type=bool, default=False, help="Do not verify commit integrity to run training.") - parser.add_argument("--rank", type=int, default=0, help="DISTRIBUTED: process rank for distributed training.") - parser.add_argument("--group_id", type=str, default="", help="DISTRIBUTED: process group id.") - - return parser - - -def get_last_checkpoint(path): - """Get latest checkpoint or/and best model in path. - - It is based on globbing for `*.pth.tar` and the RegEx - `(checkpoint|best_model)_([0-9]+)`. - - Args: - path (list): Path to files to be compared. - - Raises: - ValueError: If no checkpoint or best_model files are found. 
- - Returns: - last_checkpoint (str): Last checkpoint filename. - """ - file_names = glob.glob(os.path.join(path, "*.pth.tar")) - last_models = {} - last_model_nums = {} - for key in ["checkpoint", "best_model"]: - last_model_num = None - last_model = None - # pass all the checkpoint files and find - # the one with the largest model number suffix. - for file_name in file_names: - match = re.search(f"{key}_([0-9]+)", file_name) - if match is not None: - model_num = int(match.groups()[0]) - if last_model_num is None or model_num > last_model_num: - last_model_num = model_num - last_model = file_name - - # if there is not checkpoint found above - # find the checkpoint with the latest - # modification date. - key_file_names = [fn for fn in file_names if key in fn] - if last_model is None and len(key_file_names) > 0: - last_model = max(key_file_names, key=os.path.getctime) - last_model_num = torch.load(last_model)["step"] - - if last_model is not None: - last_models[key] = last_model - last_model_nums[key] = last_model_num - - # check what models were found - if not last_models: - raise ValueError(f"No models found in continue path {path}!") - if "checkpoint" not in last_models: # no checkpoint just best model - last_models["checkpoint"] = last_models["best_model"] - elif "best_model" not in last_models: # no best model - # this shouldn't happen, but let's handle it just in case - last_models["best_model"] = None - # finally check if last best model is more recent than checkpoint - elif last_model_nums["best_model"] > last_model_nums["checkpoint"]: - last_models["checkpoint"] = last_models["best_model"] - - return last_models["checkpoint"], last_models["best_model"] - - -def process_args(args): - """Process parsed comand line arguments. - - Args: - args (argparse.Namespace or dict like): Parsed input arguments. - - Returns: - c (TTS.utils.io.AttrDict): Config paramaters. - out_path (str): Path to save models and logging. - audio_path (str): Path to save generated test audios. - c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does - logging to the console. - tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does - the TensorBoard loggind. - """ - if isinstance(args, tuple): - args, coqpit_overrides = args - if args.continue_path: - # continue a previous training from its output folder - experiment_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") - args.restore_path, best_model = get_last_checkpoint(args.continue_path) - if not args.best_path: - args.best_path = best_model - # setup output paths and read configs - config = load_config(args.config_path) - # override values from command-line args - config.parse_known_args(coqpit_overrides, relaxed_parser=True) - if config.mixed_precision: - print(" > Mixed precision mode is ON") - experiment_path = args.continue_path - if not experiment_path: - experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug) - audio_path = os.path.join(experiment_path, "test_audios") - # setup rank 0 process in distributed training - tb_logger = None - if args.rank == 0: - os.makedirs(audio_path, exist_ok=True) - new_fields = {} - if args.restore_path: - new_fields["restore_path"] = args.restore_path - new_fields["github_branch"] = get_git_branch() - # if model characters are not set in the config file - # save the default set to the config file for future - # compatibility. 
- if config.has("characters_config"): - used_characters = parse_symbols() - new_fields["characters"] = used_characters - copy_model_files(config, experiment_path, new_fields) - os.chmod(audio_path, 0o775) - os.chmod(experiment_path, 0o775) - tb_logger = TensorboardLogger(experiment_path, model_name=config.model) - # write model desc to tensorboard - tb_logger.tb_add_text("model-config", f"
{config.to_json()}
", 0) - c_logger = ConsoleLogger() - return config, experiment_path, audio_path, c_logger, tb_logger - - -def init_training(argv): - """Initialization of a training run.""" - parser = init_arguments(argv) - args = parser.parse_known_args() - config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args) - return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 222b4c74..27b52bef 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -1,14 +1,93 @@ +from typing import Dict, Tuple + import librosa import numpy as np import scipy.io.wavfile import scipy.signal import soundfile as sf +import torch +from torch import nn from TTS.tts.utils.data import StandardScaler # import pyworld as pw +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """TODO: Merge this with audio.py""" + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. + + Shapes: + x: [B x T] or [:math:`[B, 1, T]`] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + # pylint: disable=too-many-public-methods class AudioProcessor(object): """Audio Processor for TTS used by all the data pipelines. @@ -140,7 +219,12 @@ class AudioProcessor(object): ### setting up the parameters ### def _build_mel_basis( self, - ): + ) -> np.ndarray: + """Build melspectrogram basis. + + Returns: + np.ndarray: melspectrogram basis. + """ if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( @@ -149,8 +233,12 @@ class AudioProcessor(object): def _stft_parameters( self, - ): - """Compute necessary stft parameters with given time values""" + ) -> Tuple[int, int]: + """Compute the real STFT parameters from the time values. + + Returns: + Tuple[int, int]: hop length and window length for STFT. + """ factor = self.frame_length_ms / self.frame_shift_ms assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) @@ -158,8 +246,18 @@ class AudioProcessor(object): return hop_length, win_length ### normalization ### - def normalize(self, S): - """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" + def normalize(self, S: np.ndarray) -> np.ndarray: + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + + Args: + S (np.ndarray): Spectrogram to normalize. + + Raises: + RuntimeError: Mean and variance is computed from incompatible parameters. + + Returns: + np.ndarray: Normalized spectrogram. + """ # pylint: disable=no-else-return S = S.copy() if self.signal_norm: @@ -189,8 +287,18 @@ class AudioProcessor(object): else: return S - def denormalize(self, S): - """denormalize values""" + def denormalize(self, S: np.ndarray) -> np.ndarray: + """Denormalize spectrogram values. + + Args: + S (np.ndarray): Spectrogram to denormalize. + + Raises: + RuntimeError: Mean and variance are incompatible. + + Returns: + np.ndarray: Denormalized spectrogram. + """ # pylint: disable=no-else-return S_denorm = S.copy() if self.signal_norm: @@ -218,7 +326,16 @@ class AudioProcessor(object): return S_denorm ### Mean-STD scaling ### - def load_stats(self, stats_path): + def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + """Loading mean and variance statistics from a `npy` file. + + Args: + stats_path (str): Path to the `npy` file containing + + Returns: + Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to + compute them. + """ stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg mel_mean = stats["mel_mean"] mel_std = stats["mel_std"] @@ -237,7 +354,17 @@ class AudioProcessor(object): return mel_mean, mel_std, linear_mean, linear_std, stats_config # pylint: disable=attribute-defined-outside-init - def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): + def setup_scaler( + self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray + ) -> None: + """Initialize scaler objects used in mean-std normalization. + + Args: + mel_mean (np.ndarray): Mean for melspectrograms. + mel_std (np.ndarray): STD for melspectrograms. + linear_mean (np.ndarray): Mean for full scale spectrograms. + linear_std (np.ndarray): STD for full scale spectrograms. + """ self.mel_scaler = StandardScaler() self.mel_scaler.set_stats(mel_mean, mel_std) self.linear_scaler = StandardScaler() @@ -245,32 +372,78 @@ class AudioProcessor(object): ### DB and AMP conversion ### # pylint: disable=no-self-use - def _amp_to_db(self, x): + def _amp_to_db(self, x: np.ndarray) -> np.ndarray: + """Convert amplitude values to decibels. + + Args: + x (np.ndarray): Amplitude spectrogram. + + Returns: + np.ndarray: Decibels spectrogram. + """ + return self.spec_gain * _log(np.maximum(1e-5, x), self.base) # pylint: disable=no-self-use - def _db_to_amp(self, x): + def _db_to_amp(self, x: np.ndarray) -> np.ndarray: + """Convert decibels spectrogram to amplitude spectrogram. + + Args: + x (np.ndarray): Decibels spectrogram. + + Returns: + np.ndarray: Amplitude spectrogram. + """ return _exp(x / self.spec_gain, self.base) ### Preemphasis ### - def apply_preemphasis(self, x): + def apply_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. 
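A short sketch of the new `TorchSTFT` layer on dummy data; the FFT settings below are arbitrary examples, not values required by this change.

```python
import torch

from TTS.utils.audio import TorchSTFT

stft = TorchSTFT(
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sample_rate=22050,
    n_mels=80,
    use_mel=True,
)
wav = torch.randn(2, 22050)  # dummy batch of two 1-second waveforms, shape [B, T]
mel = stft(wav)              # magnitude mel frames, shape [B, n_mels, T_frames]
```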
+ + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ if self.preemphasis == 0: raise RuntimeError(" [!] Preemphasis is set 0.0.") return scipy.signal.lfilter([1, -self.preemphasis], [1], x) - def apply_inv_preemphasis(self, x): + def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray: + """Reverse pre-emphasis.""" if self.preemphasis == 0: raise RuntimeError(" [!] Preemphasis is set 0.0.") return scipy.signal.lfilter([1], [1, -self.preemphasis], x) ### SPECTROGRAMs ### - def _linear_to_mel(self, spectrogram): + def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray: + """Project a full scale spectrogram to a melspectrogram. + + Args: + spectrogram (np.ndarray): Full scale spectrogram. + + Returns: + np.ndarray: Melspectrogram + """ return np.dot(self.mel_basis, spectrogram) - def _mel_to_linear(self, mel_spec): + def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) - def spectrogram(self, y): + def spectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + y (np.ndarray): Waveform. + + Returns: + np.ndarray: Spectrogram. + """ if self.preemphasis != 0: D = self._stft(self.apply_preemphasis(y)) else: @@ -278,7 +451,8 @@ class AudioProcessor(object): S = self._amp_to_db(np.abs(D)) return self.normalize(S).astype(np.float32) - def melspectrogram(self, y): + def melspectrogram(self, y: np.ndarray) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" if self.preemphasis != 0: D = self._stft(self.apply_preemphasis(y)) else: @@ -286,8 +460,8 @@ class AudioProcessor(object): S = self._amp_to_db(self._linear_to_mel(np.abs(D))) return self.normalize(S).astype(np.float32) - def inv_spectrogram(self, spectrogram): - """Converts spectrogram to waveform using librosa""" + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" S = self.denormalize(spectrogram) S = self._db_to_amp(S) # Reconstruct phase @@ -295,8 +469,8 @@ class AudioProcessor(object): return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self._griffin_lim(S ** self.power) - def inv_melspectrogram(self, mel_spectrogram): - """Converts melspectrogram to waveform using librosa""" + def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" D = self.denormalize(mel_spectrogram) S = self._db_to_amp(D) S = self._mel_to_linear(S) # Convert back to linear @@ -304,7 +478,15 @@ class AudioProcessor(object): return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) return self._griffin_lim(S ** self.power) - def out_linear_to_mel(self, linear_spec): + def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + linear_spec (np.ndarray): Normalized full scale linear spectrogram. + + Returns: + np.ndarray: Normalized melspectrogram. + """ S = self.denormalize(linear_spec) S = self._db_to_amp(S) S = self._linear_to_mel(np.abs(S)) @@ -313,7 +495,15 @@ class AudioProcessor(object): return mel ### STFT and ISTFT ### - def _stft(self, y): + def _stft(self, y: np.ndarray) -> np.ndarray: + """Librosa STFT wrapper. 
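And a rough round-trip sketch with `AudioProcessor`; the config and wav paths are placeholders.

```python
from TTS.config import load_config
from TTS.utils.audio import AudioProcessor

config = load_config("config.json")   # hypothetical model config path
ap = AudioProcessor(**config.audio)

wav = ap.load_wav("example.wav")      # hypothetical input file
mel = ap.melspectrogram(wav)          # normalized mel frames, shape [num_mels, T]
wav_hat = ap.inv_melspectrogram(mel)  # Griffin-Lim reconstruction
ap.save_wav(wav_hat, "reconstructed.wav")
```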
+ + Args: + y (np.ndarray): Audio signal. + + Returns: + np.ndarray: Complex number array. + """ return librosa.stft( y=y, n_fft=self.fft_size, @@ -324,7 +514,8 @@ class AudioProcessor(object): center=True, ) - def _istft(self, y): + def _istft(self, y: np.ndarray) -> np.ndarray: + """Librosa iSTFT wrapper.""" return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) def _griffin_lim(self, S): @@ -337,7 +528,8 @@ class AudioProcessor(object): return y def compute_stft_paddings(self, x, pad_sides=1): - """compute right padding (final frame) or both sides padding (first and final frames)""" + """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding + (first and final frames)""" assert pad_sides in (1, 2) pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] if pad_sides == 1: @@ -357,7 +549,17 @@ class AudioProcessor(object): # return f0 ### Audio Processing ### - def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): + def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int: + """Find the last point without silence at the end of a audio signal. + + Args: + wav (np.ndarray): Audio signal. + threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. + + Returns: + int: Last point without silence. + """ window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4) threshold = self._db_to_amp(threshold_db) @@ -375,11 +577,28 @@ class AudioProcessor(object): ] @staticmethod - def sound_norm(x): + def sound_norm(x: np.ndarray) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + + Returns: + np.ndarray: Volume normalized waveform. + """ return x / abs(x).max() * 0.95 ### save and load ### - def load_wav(self, filename, sr=None): + def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Args: + filename (str): Path to the wav file. + sr (int, optional): Sampling rate for resampling. Defaults to None. + + Returns: + np.ndarray: Loaded waveform. + """ if self.resample: x, sr = librosa.load(filename, sr=self.sample_rate) elif sr is None: @@ -396,12 +615,19 @@ class AudioProcessor(object): x = self.sound_norm(x) return x - def save_wav(self, wav, path, sr=None): + def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: + """Save a waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + """ wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) @staticmethod - def mulaw_encode(wav, qc): + def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray: mu = 2 ** qc - 1 # wav_abs = np.minimum(np.abs(wav), 1.0) signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) @@ -423,11 +649,21 @@ class AudioProcessor(object): return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16) @staticmethod - def quantize(x, bits): + def quantize(x: np.ndarray, bits: int) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. 
Must be normalized into the range `[-1, 1]`. + bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. + """ return (x + 1.0) * (2 ** bits - 1) / 2 @staticmethod def dequantize(x, bits): + """Dequantize a waveform from the given number of bits.""" return 2 * x / (2 ** bits - 1) - 1 diff --git a/TTS/utils/callbacks.py b/TTS/utils/callbacks.py new file mode 100644 index 00000000..18b6c34c --- /dev/null +++ b/TTS/utils/callbacks.py @@ -0,0 +1,75 @@ +class TrainerCallback: + def __init__(self, trainer): + super().__init__() + self.trainer = trainer + + def on_init_start(self) -> None: + if hasattr(self.trainer.model, "on_init_start"): + self.trainer.model.on_init_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_start"): + self.trainer.criterion.on_init_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_start"): + self.trainer.optimizer.on_init_start(self.trainer) + + def on_init_end(self) -> None: + if hasattr(self.trainer.model, "on_init_end"): + self.trainer.model.on_init_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_init_end"): + self.trainer.criterion.on_init_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_init_end"): + self.trainer.optimizer.on_init_end(self.trainer) + + def on_epoch_start(self) -> None: + if hasattr(self.trainer.model, "on_epoch_start"): + self.trainer.model.on_epoch_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_start"): + self.trainer.criterion.on_epoch_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_start"): + self.trainer.optimizer.on_epoch_start(self.trainer) + + def on_epoch_end(self) -> None: + if hasattr(self.trainer.model, "on_epoch_end"): + self.trainer.model.on_epoch_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_epoch_end"): + self.trainer.criterion.on_epoch_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_epoch_end"): + self.trainer.optimizer.on_epoch_end(self.trainer) + + def on_train_step_start(self) -> None: + if hasattr(self.trainer.model, "on_train_step_start"): + self.trainer.model.on_train_step_start(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_start"): + self.trainer.criterion.on_train_step_start(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_start"): + self.trainer.optimizer.on_train_step_start(self.trainer) + + def on_train_step_end(self) -> None: + + if hasattr(self.trainer.model, "on_train_step_end"): + self.trainer.model.on_train_step_end(self.trainer) + + if hasattr(self.trainer.criterion, "on_train_step_end"): + self.trainer.criterion.on_train_step_end(self.trainer) + + if hasattr(self.trainer.optimizer, "on_train_step_end"): + self.trainer.optimizer.on_train_step_end(self.trainer) + + def on_keyboard_interrupt(self) -> None: + if hasattr(self.trainer.model, "on_keyboard_interrupt"): + self.trainer.model.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.criterion, "on_keyboard_interrupt"): + self.trainer.criterion.on_keyboard_interrupt(self.trainer) + + if hasattr(self.trainer.optimizer, "on_keyboard_interrupt"): + self.trainer.optimizer.on_keyboard_interrupt(self.trainer) diff --git a/TTS/utils/distribute.py b/TTS/utils/distribute.py index 7a1078e8..1c6b0e1c 100644 --- a/TTS/utils/distribute.py +++ b/TTS/utils/distribute.py @@ -1,53 +1,8 @@ # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py -import math - import torch import torch.distributed as dist from torch._utils import 
_flatten_dense_tensors, _unflatten_dense_tensors from torch.autograd import Variable -from torch.utils.data.sampler import Sampler - - -class DistributedSampler(Sampler): - """ - Non shuffling Distributed Sampler - """ - - def __init__(self, dataset, num_replicas=None, rank=None): - super().__init__(dataset) - if num_replicas is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - num_replicas = dist.get_world_size() - if rank is None: - if not dist.is_available(): - raise RuntimeError("Requires distributed package to be available") - rank = dist.get_rank() - self.dataset = dataset - self.num_replicas = num_replicas - self.rank = rank - self.epoch = 0 - self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) - self.total_size = self.num_samples * self.num_replicas - - def __iter__(self): - indices = torch.arange(len(self.dataset)).tolist() - - # add extra samples to make it evenly divisible - indices += indices[: (self.total_size - len(indices))] - assert len(indices) == self.total_size - - # subsample - indices = indices[self.rank : self.total_size : self.num_replicas] - assert len(indices) == self.num_samples - - return iter(indices) - - def __len__(self): - return self.num_samples - - def set_epoch(self, epoch): - self.epoch = epoch def reduce_tensor(tensor, num_gpus): diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index a562e86f..e7c57529 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -8,10 +8,21 @@ import shutil import subprocess import sys from pathlib import Path +from typing import Dict import torch +def to_cuda(x: torch.Tensor) -> torch.Tensor: + if x is None: + return None + if torch.is_tensor(x): + x = x.contiguous() + if torch.cuda.is_available(): + x = x.cuda(non_blocking=True) + return x + + def get_cuda(): use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -47,13 +58,10 @@ def get_commit_hash(): return commit -def create_experiment_folder(root_path, model_name, debug): +def create_experiment_folder(root_path, model_name): """Create a folder with the current date and time""" date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - if debug: - commit_hash = "debug" - else: - commit_hash = get_commit_hash() + commit_hash = get_commit_hash() output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) os.makedirs(output_folder, exist_ok=True) print(" > Experiment folder: {}".format(output_folder)) @@ -126,6 +134,22 @@ def set_init_dict(model_dict, checkpoint_state, c): return model_dict +def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: + """Format kwargs to hande auxilary inputs to models. + + Args: + def_args (Dict): A dictionary of argument names and their default values if not defined in `kwargs`. + kwargs (Dict): A `dict` or `kwargs` that includes auxilary inputs to the model. + + Returns: + Dict: arguments with formatted auxilary inputs. 
+ """ + for name in def_args: + if name not in kwargs: + kwargs[def_args[name]] = None + return kwargs + + class KeepAverage: def __init__(self): self.avg_values = {} diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 62d972f1..871cff6c 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -1,7 +1,12 @@ +import datetime +import glob import os import pickle as pickle_tts from shutil import copyfile +import torch +from coqpit import Coqpit + class RenamingUnpickler(pickle_tts.Unpickler): """Overload default pickler to solve module renaming problem""" @@ -41,3 +46,119 @@ def copy_model_files(config, out_path, new_fields): config.audio.stats_path, copy_stats_path, ) + + +def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin + try: + state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + except ModuleNotFoundError: + pickle_tts.Unpickler = RenamingUnpickler + state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) + model.load_state_dict(state["model"]) + if use_cuda: + model.cuda() + if eval: + model.eval() + return model, state + + +def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): + if hasattr(model, "module"): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() + if isinstance(optimizer, list): + optimizer_state = [optim.state_dict() for optim in optimizer] + else: + optimizer_state = optimizer.state_dict() if optimizer is not None else None + + if isinstance(scaler, list): + scaler_state = [s.state_dict() for s in scaler] + else: + scaler_state = scaler.state_dict() if scaler is not None else None + + if isinstance(config, Coqpit): + config = config.to_dict() + + state = { + "config": config, + "model": model_state, + "optimizer": optimizer_state, + "scaler": scaler_state, + "step": current_step, + "epoch": epoch, + "date": datetime.date.today().strftime("%B %d, %Y"), + } + state.update(kwargs) + torch.save(state, output_path) + + +def save_checkpoint( + config, + model, + optimizer, + scaler, + current_step, + epoch, + output_folder, + **kwargs, +): + file_name = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = os.path.join(output_folder, file_name) + print("\n > CHECKPOINT : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + **kwargs, + ) + + +def save_best_model( + current_loss, + best_loss, + config, + model, + optimizer, + scaler, + current_step, + epoch, + out_path, + keep_all_best=False, + keep_after=10000, + **kwargs, +): + if current_loss < best_loss: + best_model_name = f"best_model_{current_step}.pth.tar" + checkpoint_path = os.path.join(out_path, best_model_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model( + config, + model, + optimizer, + scaler, + current_step, + epoch, + checkpoint_path, + model_loss=current_loss, + **kwargs, + ) + # only delete previous if current is saved successfully + if not keep_all_best or (current_step < keep_after): + model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) + for model_name in model_names: + if os.path.basename(model_name) == best_model_name: + continue + os.remove(model_name) + # create symlink to best model for convinience + link_name = "best_model.pth.tar" + link_path = os.path.join(out_path, link_name) + if os.path.islink(link_path) or os.path.isfile(link_path): + os.remove(link_path) + 
os.symlink(best_model_name, os.path.join(out_path, link_name)) + best_loss = current_loss + return best_loss diff --git a/TTS/utils/logging/__init__.py b/TTS/utils/logging/__init__.py new file mode 100644 index 00000000..877131c4 --- /dev/null +++ b/TTS/utils/logging/__init__.py @@ -0,0 +1,2 @@ +from TTS.utils.logging.console_logger import ConsoleLogger +from TTS.utils.logging.tensorboard_logger import TensorboardLogger diff --git a/TTS/utils/console_logger.py b/TTS/utils/logging/console_logger.py similarity index 95% rename from TTS/utils/console_logger.py rename to TTS/utils/logging/console_logger.py index 7d6e1968..bb6644c9 100644 --- a/TTS/utils/console_logger.py +++ b/TTS/utils/logging/console_logger.py @@ -68,11 +68,10 @@ class ConsoleLogger: print(log_text, flush=True) def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") + print(f"\n{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") def print_eval_step(self, step, loss_dict, avg_loss_dict): indent = " | > " - print() log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" for key, value in loss_dict.items(): # print the avg value if given @@ -84,7 +83,7 @@ class ConsoleLogger: def print_epoch_end(self, epoch, avg_loss_dict): indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) + log_text = "\n {}--> EVAL PERFORMANCE{}\n".format(tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given color = "" diff --git a/TTS/utils/tensorboard_logger.py b/TTS/utils/logging/tensorboard_logger.py similarity index 95% rename from TTS/utils/tensorboard_logger.py rename to TTS/utils/logging/tensorboard_logger.py index 3874a42b..3d7ea1e6 100644 --- a/TTS/utils/tensorboard_logger.py +++ b/TTS/utils/logging/tensorboard_logger.py @@ -34,12 +34,14 @@ class TensorboardLogger(object): def dict_to_tb_audios(self, scope_name, audios, step, sample_rate): for key, value in audios.items(): + if value.dtype == "float16": + value = value.astype("float32") try: self.writer.add_audio("{}/{}".format(scope_name, key), value, step, sample_rate=sample_rate) except RuntimeError: traceback.print_exc() - def tb_train_iter_stats(self, step, stats): + def tb_train_step_stats(self, step, stats): self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) def tb_train_epoch_stats(self, step, stats): diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index f5165079..d5e8d410 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,7 +3,7 @@ import json import os import zipfile from pathlib import Path -from shutil import copyfile +from shutil import copyfile, rmtree import gdown import requests @@ -83,7 +83,7 @@ class ModelManager(object): 'type/language/dataset/model' e.g. 'tts_model/en/ljspeech/tacotron' - Every model must have the following files + Every model must have the following files: - *.pth.tar : pytorch model checkpoint file. - config.json : model config file. - scale_stats.npy (if exist): scale values for preprocessing. @@ -101,11 +101,7 @@ class ModelManager(object): output_path = os.path.join(self.output_prefix, model_full_name) output_model_path = os.path.join(output_path, "model_file.pth.tar") output_config_path = os.path.join(output_path, "config.json") - # NOTE : band-aid for removing phoneme support - # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: - # raise RuntimeError( - # " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." 
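The new checkpointing helpers could be wired into a training loop roughly as follows; `config`, `model`, `optimizer`, `step`, `epoch` and `eval_loss` are assumed to come from the surrounding loop, the output folder is assumed to exist, and the scaler argument is `None` when mixed precision is disabled.

```python
from TTS.utils.io import save_best_model, save_checkpoint

best_loss = float("inf")  # tracked across epochs

# save a rolling checkpoint, then update the best model if eval_loss improved
save_checkpoint(config, model, optimizer, None, step, epoch, "output/run")
best_loss = save_best_model(
    eval_loss, best_loss, config, model, optimizer, None,
    step, epoch, "output/run",
    keep_all_best=False, keep_after=10000,
)
```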
- # ) + if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: @@ -116,7 +112,6 @@ class ModelManager(object): # download files to the output path if self._check_dict_key(model_item, "github_rls_url"): # download from github release - # TODO: pass output_path self._download_zip_file(model_item["github_rls_url"], output_path) else: # download from gdrive @@ -137,7 +132,7 @@ class ModelManager(object): # set scale stats path in config.json config_path = output_config_path config = load_config(config_path) - config.external_speaker_embedding_file = output_speakers_path + config.d_vector_file = output_speakers_path config.save_json(config_path) return output_model_path, output_config_path, model_item @@ -146,15 +141,20 @@ class ModelManager(object): gdown.download(f"{self.url_prefix}{gdrive_idx}", output=output, quiet=False) @staticmethod - def _download_zip_file(file_url, output): + def _download_zip_file(file_url, output_folder): """Download the github releases""" + # download the file r = requests.get(file_url) + # extract the file with zipfile.ZipFile(io.BytesIO(r.content)) as z: - z.extractall(output) + z.extractall(output_folder) + # move the files to the outer path for file_path in z.namelist()[1:]: - src_path = os.path.join(output, file_path) - dst_path = os.path.join(output, os.path.basename(file_path)) + src_path = os.path.join(output_folder, file_path) + dst_path = os.path.join(output_folder, os.path.basename(file_path)) copyfile(src_path, dst_path) + # remove the extracted folder + rmtree(os.path.join(output_folder, z.namelist()[0])) @staticmethod def _check_dict_key(my_dict, key): diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py index b6c86fed..73426e64 100644 --- a/TTS/utils/radam.py +++ b/TTS/utils/radam.py @@ -1,4 +1,4 @@ -# from https://github.com/LiyuanLucasLiu/RAdam +# modified from https://github.com/LiyuanLucasLiu/RAdam import math diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bca3df31..56a8c9b2 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -6,7 +6,7 @@ import pysbd import torch from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.utils.speakers import SpeakerManager # pylint: disable=unused-wildcard-import @@ -14,7 +14,8 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis, trim_silence from TTS.tts.utils.text import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor -from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input, setup_generator +from TTS.vocoder.models import setup_model as setup_vocoder_model +from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input class Synthesizer(object): @@ -63,7 +64,7 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} - self.speaker_embedding_dim = 0 + self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -98,9 +99,9 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("d_vector_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers - self.speaker_embedding_dim = 
self.speaker_manager.x_vector_dim + self.d_vector_dim = self.speaker_manager.d_vector_dim def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. @@ -127,16 +128,11 @@ class Synthesizer(object): if self.tts_config.use_speaker_embedding is True: self.tts_speakers_file = ( - self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"] + self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"] ) - self._load_speakers(self.tts_speakers_file) + self.tts_config["d_vector_file"] = self.tts_speakers_file - self.tts_model = setup_model( - self.input_size, - num_speakers=self.num_speakers, - c=self.tts_config, - speaker_embedding_dim=self.speaker_embedding_dim, - ) + self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() @@ -151,7 +147,7 @@ class Synthesizer(object): """ self.vocoder_config = load_config(model_config) self.vocoder_ap = AudioProcessor(verbose=False, **self.vocoder_config.audio) - self.vocoder_model = setup_generator(self.vocoder_config) + self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) if use_cuda: self.vocoder_model.cuda() @@ -197,9 +193,9 @@ class Synthesizer(object): print(sens) if self.tts_speakers_file: - # get the speaker embedding from the saved x_vectors. + # get the speaker embedding from the saved d_vectors. if speaker_idx and isinstance(speaker_idx, str): - speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " @@ -214,15 +210,15 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) - # compute a new x_vector from the given clip. + # compute a new d_vector from the given clip. if speaker_wav is not None: - speaker_embedding = self.speaker_manager.compute_x_vector_from_clip(speaker_wav) + speaker_embedding = self.speaker_manager.compute_d_vector_from_clip(speaker_wav) use_gl = self.vocoder_model is None for sen in sens: # synthesize voice - waveform, _, _, mel_postnet_spec, _, _ = synthesis( + outputs = synthesis( model=self.tts_model, text=sen, CONFIG=self.tts_config, @@ -230,11 +226,12 @@ class Synthesizer(object): ap=self.ap, speaker_id=None, style_wav=style_wav, - truncated=False, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - speaker_embedding=speaker_embedding, + d_vector=speaker_embedding, ) + waveform = outputs["wav"] + mel_postnet_spec = outputs["model_outputs"] if not use_gl: # denormalize tts output based on tts audio config mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T diff --git a/TTS/utils/trainer_utils.py b/TTS/utils/trainer_utils.py new file mode 100644 index 00000000..29915527 --- /dev/null +++ b/TTS/utils/trainer_utils.py @@ -0,0 +1,69 @@ +import importlib +from typing import Dict + +import torch + +from TTS.utils.training import NoamLR + + +def is_apex_available(): + return importlib.util.find_spec("apex") is not None + + +def setup_torch_training_env(cudnn_enable, cudnn_benchmark): + num_gpus = torch.cuda.device_count() + if num_gpus > 1: + raise RuntimeError( + f" [!] 
{num_gpus} active GPUs. Define the target GPU by `CUDA_VISIBLE_DEVICES`. For multi-gpu training use `TTS/bin/distribute.py`." + ) + torch.backends.cudnn.enabled = cudnn_enable + torch.backends.cudnn.benchmark = cudnn_benchmark + torch.manual_seed(54321) + use_cuda = torch.cuda.is_available() + print(" > Using CUDA: ", use_cuda) + print(" > Number of GPUs: ", num_gpus) + return use_cuda, num_gpus + + +def get_scheduler( + lr_scheduler: str, lr_scheduler_params: Dict, optimizer: torch.optim.Optimizer +) -> torch.optim.lr_scheduler._LRScheduler: # pylint: disable=protected-access + """Find, initialize and return a scheduler. + + Args: + lr_scheduler (str): Scheduler name. + lr_scheduler_params (Dict): Scheduler parameters. + optimizer (torch.optim.Optimizer): Optimizer to pass to the scheduler. + + Returns: + torch.optim.lr_scheduler._LRScheduler: Functional scheduler. + """ + if lr_scheduler is None: + return None + if lr_scheduler.lower() == "noamlr": + scheduler = NoamLR + else: + scheduler = getattr(torch.optim.lr_scheduler, lr_scheduler) + return scheduler(optimizer, **lr_scheduler_params) + + +def get_optimizer( + optimizer_name: str, optimizer_params: dict, lr: float, model: torch.nn.Module +) -> torch.optim.Optimizer: + """Find, initialize and return a optimizer. + + Args: + optimizer_name (str): Optimizer name. + optimizer_params (dict): Optimizer parameters. + lr (float): Initial learning rate. + model (torch.nn.Module): Model to pass to the optimizer. + + Returns: + torch.optim.Optimizer: Functional optimizer. + """ + if optimizer_name.lower() == "radam": + module = importlib.import_module("TTS.utils.radam") + optimizer = getattr(module, "RAdam") + else: + optimizer = getattr(torch.optim, optimizer_name) + return optimizer(model.parameters(), lr=lr, **optimizer_params) diff --git a/TTS/utils/training.py b/TTS/utils/training.py index 37b32637..aa5651c5 100644 --- a/TTS/utils/training.py +++ b/TTS/utils/training.py @@ -2,17 +2,6 @@ import numpy as np import torch -def setup_torch_training_env(cudnn_enable, cudnn_benchmark): - torch.backends.cudnn.enabled = cudnn_enable - torch.backends.cudnn.benchmark = cudnn_benchmark - torch.manual_seed(54321) - use_cuda = torch.cuda.is_available() - num_gpus = torch.cuda.device_count() - print(" > Using CUDA: ", use_cuda) - print(" > Number of GPUs: ", num_gpus) - return use_cuda, num_gpus - - def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): r"""Check model gradient against unexpected jumps and failures""" skip_flag = False @@ -41,46 +30,6 @@ def check_update(model, grad_clip, ignore_stopnet=False, amp_opt_params=None): return grad_norm, skip_flag -def lr_decay(init_lr, global_step, warmup_steps): - r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py""" - warmup_steps = float(warmup_steps) - step = global_step + 1.0 - lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) - return lr - - -def adam_weight_decay(optimizer): - """ - Custom weight decay operation, not effecting grad values. - """ - for group in optimizer.param_groups: - for param in group["params"]: - current_lr = group["lr"] - weight_decay = group["weight_decay"] - factor = -weight_decay * group["lr"] - param.data = param.data.add(param.data, alpha=factor) - return optimizer, current_lr - - -# pylint: disable=dangerous-default-value -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): - """ - Skip biases, BatchNorm parameters, rnns. 
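To make the new trainer helpers concrete, here is a small sketch of how `get_optimizer` and `get_scheduler` from TTS/utils/trainer_utils resolve classes by name; the linear model and the hyper-parameter values are illustrative only.

import torch
from TTS.utils.trainer_utils import get_optimizer, get_scheduler

model = torch.nn.Linear(80, 80)

# resolves torch.optim.AdamW by name (or TTS.utils.radam.RAdam when "radam" is requested)
optimizer = get_optimizer("AdamW", {"betas": [0.8, 0.99], "weight_decay": 0.0}, lr=1e-4, model=model)

# resolves the in-house NoamLR or any torch.optim.lr_scheduler class by name
scheduler = get_scheduler("ExponentialLR", {"gamma": 0.999, "last_epoch": -1}, optimizer)

for _ in range(3):
    optimizer.step()  # a real loop would compute a loss and call backward() first
    scheduler.step()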
- and attention projection layer v - """ - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - - if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): - no_decay.append(param) - else: - decay.append(param) - return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] - - # pylint: disable=protected-access class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): @@ -107,3 +56,31 @@ def gradual_training_scheduler(global_step, config): if global_step * num_gpus >= values[0]: new_values = values return new_values[1], new_values[2] + + +def lr_decay(init_lr, global_step, warmup_steps): + r"""from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py + It is only being used by the Speaker Encoder trainer.""" + warmup_steps = float(warmup_steps) + step = global_step + 1.0 + lr = init_lr * warmup_steps ** 0.5 * np.minimum(step * warmup_steps ** -1.5, step ** -0.5) + return lr + + +# pylint: disable=dangerous-default-value +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): + """ + Skip biases, BatchNorm parameters, rnns. + and attention projection layer v + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if len(param.shape) == 1 or any((skip_name in name for skip_name in skip_list)): + no_decay.append(param) + else: + decay.append(param) + return [{"params": no_decay, "weight_decay": 0.0}, {"params": decay, "weight_decay": weight_decay}] diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py index 53444214..2ab83aac 100644 --- a/TTS/vocoder/configs/fullband_melgan_config.py +++ b/TTS/vocoder/configs/fullband_melgan_config.py @@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. Defaults to @@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig): L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. """ - model: str = "melgan" + model: str = "fullband_melgan" # Model specific params discriminator_model: str = "melgan_multiscale_discriminator" diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 81fd7904..76311353 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right model at initialization. Defaults to `melgan`. + Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'melgan_multiscale_discriminator`. discriminator_model_params (dict): The discriminator model parameters. 
Defaults to diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index d132d2e1..a89b1f3f 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig): Args: model (str): - Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`. + Model name used for selecting the right configuration at initialization. Defaults to `gan`. discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to 'parallel_wavegan_discriminator`. discriminator_model_params (dict): The discriminator model kwargs. Defaults to diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index 664032d2..6891ce6c 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig): Number of training epochs to. Defaults to 10000. wd (float): Weight decay. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig): # OPTIMIZER epochs: int = 10000 # total number of epochs to train. wd: float = 0.0 # Weight decay weight. + optimizer: str = "AdamW" + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) @dataclass @@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig): }` target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_G_loss`. - gen_clip_grad (float): - Gradient clipping threshold for the generator model. Any value less than 0 disables clipping. - Defaults to -1. - disc_clip_grad (float): - Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping. - Defaults to -1. + grad_clip (list): + A list of gradient clipping theresholds for each optimizer. Any value less than 0 disables clipping. + Defaults to [5, 5]. lr_gen (float): Generator model initial learning rate. Defaults to 0.0002. lr_disc (float): Discriminator model initial learning rate. Defaults to 0.0002. - optimizer (torch.optim.Optimizer): - Optimizer used for the training. Defaults to `AdamW`. - optimizer_params (dict): - Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` lr_scheduler_gen (torch.optim.Scheduler): Learning rate scheduler for the generator. Defaults to `ExponentialLR`. lr_scheduler_gen_params (dict): @@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig): Enabling it results in slower iterations but faster convergance in some cases. Defaults to False. """ + model: str = "gan" + # LOSS PARAMETERS use_stft_loss: bool = True use_subband_stft_loss: bool = True @@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig): } ) - target_loss: str = "avg_G_loss" # loss value to pick the best model to save after each epoch + target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch # optimizer - gen_clip_grad: float = -1 # Generator gradient clipping threshold. Apply gradient clipping if > 0 - disc_clip_grad: float = -1 # Discriminator gradient clipping threshold. 
+ grad_clip: float = field(default_factory=lambda: [5, 5]) lr_gen: float = 0.0002 # Initial learning rate. lr_disc: float = 0.0002 # Initial learning rate. - optimizer: str = "AdamW" - optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0}) lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https:#pytorch.org/docs/stable/optim.html diff --git a/TTS/vocoder/configs/univnet_config.py b/TTS/vocoder/configs/univnet_config.py new file mode 100644 index 00000000..85662831 --- /dev/null +++ b/TTS/vocoder/configs/univnet_config.py @@ -0,0 +1,160 @@ +from dataclasses import dataclass, field + +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig + + +@dataclass +class UnivnetConfig(BaseGANVocoderConfig): + """Defines parameters for UnivNet vocoder. + + Example: + + >>> from TTS.vocoder.configs import UnivNetConfig + >>> config = UnivNetConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `UnivNet`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'UnivNet_discriminator`. + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `UnivNet_generator`. + generator_model_params (dict): Parameters of the generator model. Defaults to + ` + { + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + } + ` + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 32. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + use_stft_loss (bool): + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by univnet model. Defaults to False. + stft_loss_params (dict): + STFT loss parameters. Default to + `{ + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }` + l1_spec_loss_params (dict): + L1 spectrogram loss parameters. 
Default to + `{ + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + }` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ + + model: str = "univnet" + batch_size: int = 32 + # model specific params + discriminator_model: str = "univnet_discriminator" + generator_model: str = "univnet_generator" + generator_model_params: dict = field( + default_factory=lambda: { + "in_channels": 64, + "out_channels": 1, + "hidden_channels": 32, + "cond_channels": 80, + "upsample_factors": [8, 8, 4], + "lvc_layers_each_block": 4, + "lvc_kernel_size": 3, + "kpnet_hidden_channels": 64, + "kpnet_conv_size": 3, + "dropout": 0.0, + } + ) + + # LOSS PARAMETERS - overrides + use_stft_loss: bool = True + use_subband_stft_loss: bool = False + use_mse_gan_loss: bool = True + use_hinge_gan_loss: bool = False + use_feat_match_loss: bool = False # requires MelGAN Discriminators (MelGAN and univnet) + use_l1_spec_loss: bool = False + + # loss weights - overrides + stft_loss_weight: float = 2.5 + stft_loss_params: dict = field( + default_factory=lambda: { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240], + } + ) + subband_stft_loss_weight: float = 0 + mse_G_loss_weight: float = 1 + hinge_G_loss_weight: float = 0 + feat_match_loss_weight: float = 0 + l1_spec_loss_weight: float = 0 + l1_spec_loss_params: dict = field( + default_factory=lambda: { + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + } + ) + + # optimizer parameters + lr_gen: float = 1e-4 # Initial learning rate. + lr_disc: float = 1e-4 # Initial learning rate. 
+ lr_scheduler_gen: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + # lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) + lr_scheduler_disc: str = None # one of the schedulers from https:#pytorch.org/docs/stable/optim.html + # lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1}) + optimizer_params: dict = field(default_factory=lambda: {"betas": [0.5, 0.9], "weight_decay": 0.0}) + steps_to_start_discriminator: int = 200000 + + def __post_init__(self): + super().__post_init__() + self.generator_model_params["cond_channels"] = self.audio.num_mels diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py index 271422ee..c39813ae 100644 --- a/TTS/vocoder/configs/wavegrad_config.py +++ b/TTS/vocoder/configs/wavegrad_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavegrad import WavegradArgs @dataclass @@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig): Model name used for selecting the right model at initialization. Defaults to `wavegrad`. generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is considered as a generator too. Defaults to `wavegrad`. - model_params (dict): - WaveGrad kwargs. Defaults to - ` - { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ` + model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values. target_loss (str): Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`. epochs (int): @@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig): model: str = "wavegrad" # Model specific params generator_model: str = "wavegrad" - model_params: dict = field( - default_factory=lambda: { - "use_weight_norm": True, - "y_conv_channels": 32, - "x_conv_channels": 768, - "ublock_out_channels": [512, 512, 256, 128, 128], - "dblock_out_channels": [128, 128, 256, 512], - "upsample_factors": [4, 4, 4, 2, 2], - "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - } - ) - target_loss: str = "avg_wavegrad_loss" # loss value to pick the best model to save after each epoch + model_params: WavegradArgs = field(default_factory=WavegradArgs) + target_loss: str = "loss" # loss value to pick the best model to save after each epoch # Training - overrides epochs: int = 10000 diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py index 95a3cfc4..0afa1f43 100644 --- a/TTS/vocoder/configs/wavernn_config.py +++ b/TTS/vocoder/configs/wavernn_config.py @@ -1,6 +1,7 @@ from dataclasses import dataclass, field from TTS.vocoder.configs.shared_configs import BaseVocoderConfig +from TTS.vocoder.models.wavernn import WavernnArgs @dataclass @@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig): Batch size used at training. Larger values use more memory. Defaults to 256. seq_len (int): Audio segment length used at training. Larger values use more memory. Defaults to 1280. 
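A quick sketch of the new UnivNet configuration in use, assuming `UnivnetConfig` is exposed from `TTS.vocoder.configs` like the other vocoder configs; the printed values follow the defaults shown above.

from TTS.vocoder.configs import UnivnetConfig

config = UnivnetConfig()
# __post_init__ keeps the generator conditioning size in sync with the audio settings
assert config.generator_model_params["cond_channels"] == config.audio.num_mels

# the optimizer now lives on the base vocoder config; UnivNet only overrides its betas
print(config.optimizer, config.optimizer_params)  # AdamW {'betas': [0.5, 0.9], 'weight_decay': 0.0}
print(config.grad_clip)    # [5, 5] -> one clipping threshold per optimizer
print(config.target_loss)  # "loss_0" is used to pick the best checkpoint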
- padding (int): - Padding applied to the input feature frames against the convolution layers of the feature network. - Defaults to 2. + use_noise_augment (bool): enable / disable random noise added to the input waveform. The noise is added after computing the features. Defaults to True. @@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig): enable / disable mixed precision training. Default is True. eval_split_size (int): Number of samples used for evalutaion. Defaults to 50. - test_every_epoch (int): + num_epochs_before_test (int): Number of epochs waited to run the next evalution. Since inference takes some time, it is better to wait some number of epochs not ot waste training time. Defaults to 10. grad_clip (float): @@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig): model: str = "wavernn" # Model specific params - mode: str = "mold" # mold [string], gauss [string], bits [int] - mulaw: bool = True # apply mulaw if mode is bits - generator_model: str = "WaveRNN" - wavernn_model_params: dict = field( - default_factory=lambda: { - "rnn_dims": 512, - "fc_dims": 512, - "compute_dims": 128, - "res_out_dims": 128, - "num_res_blocks": 10, - "use_aux_net": True, - "use_upsample_net": True, - "upsample_factors": [4, 8, 8], # this needs to correctly factorise hop_length - } - ) + model_params: WavernnArgs = field(default_factory=WavernnArgs) + target_loss: str = "loss" # Inference batched: bool = True @@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig): epochs: int = 10000 batch_size: int = 256 seq_len: int = 1280 - padding: int = 2 use_noise_augment: bool = False use_cache: bool = True mixed_precision: bool = True eval_split_size: int = 50 - test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). + num_epochs_before_test: int = ( + 10 # number of epochs to wait until the next test run (synthesizing a full audio clip). 
+ ) # optimizer overrides grad_clip: float = 4.0 diff --git a/TTS/vocoder/datasets/__init__.py b/TTS/vocoder/datasets/__init__.py index e69de29b..86b059c3 100644 --- a/TTS/vocoder/datasets/__init__.py +++ b/TTS/vocoder/datasets/__init__.py @@ -0,0 +1,57 @@ +from typing import List + +from coqpit import Coqpit +from torch.utils.data import Dataset + +from TTS.utils.audio import AudioProcessor +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset + + +def setup_dataset(config: Coqpit, ap: AudioProcessor, is_eval: bool, data_items: List, verbose: bool) -> Dataset: + if config.model.lower() in "gan": + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + elif config.model.lower() == "wavegrad": + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + elif config.model.lower() == "wavernn": + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + else: + raise ValueError(f" [!] Dataset for model {config.model.lower()} cannot be found.") + return dataset diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index d99ee147..c4569b3d 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -3,10 +3,21 @@ import os from pathlib import Path import numpy as np +from coqpit import Coqpit from tqdm import tqdm +from TTS.utils.audio import AudioProcessor -def preprocess_wav_files(out_path, config, ap): + +def preprocess_wav_files(out_path: str, config: Coqpit, ap: AudioProcessor): + """Process wav and compute mel and quantized wave signal. + It is mainly used by WaveRNN dataloader. + + Args: + out_path (str): Parent folder path to save the files. + config (Coqpit): Model config. + ap (AudioProcessor): Audio processor. 
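Below is a hedged sketch of the new `setup_dataset` factory. The dataset path, the 50-file eval split and the choice of `ParallelWaveganConfig` (whose `model` name resolves to the GAN dataset branch) are assumptions made for illustration.

from TTS.utils.audio import AudioProcessor
from TTS.vocoder.configs import ParallelWaveganConfig
from TTS.vocoder.datasets import setup_dataset
from TTS.vocoder.datasets.preprocess import find_wav_files

config = ParallelWaveganConfig()
config.data_path = "/data/LJSpeech-1.1/wavs"  # hypothetical dataset location
ap = AudioProcessor(**config.audio)
wav_paths = find_wav_files(config.data_path)

# dispatches to GANDataset / WaveGradDataset / WaveRNNDataset based on config.model
train_ds = setup_dataset(config, ap, is_eval=False, data_items=wav_paths[50:], verbose=True)
eval_ds = setup_dataset(config, ap, is_eval=True, data_items=wav_paths[:50], verbose=False)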
+ """ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) wav_files = find_wav_files(config.data_path) @@ -18,7 +29,9 @@ def preprocess_wav_files(out_path, config, ap): mel = ap.melspectrogram(y) np.save(mel_path, mel) if isinstance(config.mode, int): - quant = ap.mulaw_encode(y, qc=config.mode) if config.mulaw else ap.quantize(y, bits=config.mode) + quant = ( + ap.mulaw_encode(y, qc=config.mode) if config.model_params.mulaw else ap.quantize(y, bits=config.mode) + ) np.save(quant_path, quant) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index c0d24e84..d99fc417 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -136,4 +136,4 @@ class WaveGradDataset(Dataset): mels[idx, :, : mel.shape[1]] = mel audios[idx, : audio.shape[0]] = audio - return mels, audios + return audios, mels diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1596ea8f..d648b68c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -10,16 +10,7 @@ class WaveRNNDataset(Dataset): """ def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - mulaw, - is_training=True, - verbose=False, + self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True ): super().__init__() @@ -34,6 +25,7 @@ class WaveRNNDataset(Dataset): self.mulaw = mulaw self.is_training = is_training self.verbose = verbose + self.return_segments = return_segments assert self.seq_len % self.hop_len == 0 @@ -44,6 +36,16 @@ class WaveRNNDataset(Dataset): item = self.load_item(index) return item + def load_test_samples(self, num_samples): + samples = [] + return_segments = self.return_segments + self.return_segments = False + for idx in range(num_samples): + mel, audio, _ = self.load_item(idx) + samples.append([mel, audio]) + self.return_segments = return_segments + return samples + def load_item(self, index): """ load (audio, feat) couple if feature_path is set @@ -53,7 +55,10 @@ class WaveRNNDataset(Dataset): wavpath = self.item_list[index] audio = self.ap.load_wav(wavpath) - min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + if self.return_segments: + min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + else: + min_audio_len = audio.shape[0] + (2 * self.pad * self.hop_len) if audio.shape[0] < min_audio_len: print(" [!] Instance is too short! 
: {}".format(wavpath)) audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len]) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 18076d85..848e292b 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -1,83 +1,11 @@ -import librosa +from typing import Dict, Union + import torch from torch import nn from torch.nn import functional as F - -class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """TODO: Merge this with audio.py""" - - def __init__( - self, - n_fft, - hop_length, - win_length, - pad_wav=False, - window="hann_window", - sample_rate=None, - mel_fmin=0, - mel_fmax=None, - n_mels=80, - use_mel=False, - ): - super().__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.pad_wav = pad_wav - self.sample_rate = sample_rate - self.mel_fmin = mel_fmin - self.mel_fmax = mel_fmax - self.n_mels = n_mels - self.use_mel = use_mel - self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) - self.mel_basis = None - if use_mel: - self._build_mel_basis() - - def __call__(self, x): - """Compute spectrogram frames by torch based stft. - - Args: - x (Tensor): input waveform - - Returns: - Tensor: spectrogram frames. - - Shapes: - x: [B x T] or [B x 1 x T] - """ - if x.ndim == 2: - x = x.unsqueeze(1) - if self.pad_wav: - padding = int((self.n_fft - self.hop_length) / 2) - x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") - # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True, - return_complex=False, - ) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) - if self.use_mel: - S = torch.matmul(self.mel_basis.to(x), S) - return S - - def _build_mel_basis(self): - mel_basis = librosa.filters.mel( - self.sample_rate, self.n_fft, n_mels=self.n_mels, fmin=self.mel_fmin, fmax=self.mel_fmax - ) - self.mel_basis = torch.from_numpy(mel_basis).float() - +from TTS.utils.audio import TorchSTFT +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss ################################# # GENERATOR LOSSES @@ -271,7 +199,7 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): loss += total_loss real_loss += real_loss fake_loss += fake_loss - # normalize loss values with number of scales + # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) fake_loss /= len(scores_fake) @@ -374,7 +302,7 @@ class GeneratorLoss(nn.Module): feat_match_loss = self.feat_match_loss(feats_fake, feats_real) return_dict["G_feat_match_loss"] = feat_match_loss adv_loss = adv_loss + self.feat_match_loss_weight * feat_match_loss - return_dict["G_loss"] = gen_loss + adv_loss + return_dict["loss"] = gen_loss + adv_loss return_dict["G_gen_loss"] = gen_loss return_dict["G_adv_loss"] = adv_loss return return_dict @@ -419,5 +347,22 @@ class DiscriminatorLoss(nn.Module): return_dict["D_hinge_gan_fake_loss"] = hinge_D_fake_loss loss += hinge_D_loss - return_dict["D_loss"] = loss + return_dict["loss"] = loss return return_dict + + +class WaveRNNLoss(nn.Module): + def __init__(self, wave_rnn_mode: Union[str, int]): + super().__init__() + if wave_rnn_mode == "mold": + self.loss_func = discretized_mix_logistic_loss + elif wave_rnn_mode == "gauss": + self.loss_func = gaussian_loss + 
elif isinstance(wave_rnn_mode, int): + self.loss_func = torch.nn.CrossEntropyLoss() + else: + raise ValueError(" [!] Unknown mode for Wavernn.") + + def forward(self, y_hat, y) -> Dict: + loss = self.loss_func(y_hat, y) + return {"loss": loss} diff --git a/TTS/vocoder/layers/lvc_block.py b/TTS/vocoder/layers/lvc_block.py new file mode 100644 index 00000000..0e29ee3c --- /dev/null +++ b/TTS/vocoder/layers/lvc_block.py @@ -0,0 +1,198 @@ +import torch +import torch.nn.functional as F + + +class KernelPredictor(torch.nn.Module): + """Kernel predictor for the location-variable convolutions""" + + def __init__( # pylint: disable=dangerous-default-value + self, + cond_channels, + conv_in_channels, + conv_out_channels, + conv_layers, + conv_kernel_size=3, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + kpnet_nonlinear_activation="LeakyReLU", + kpnet_nonlinear_activation_params={"negative_slope": 0.1}, + ): + """ + Args: + cond_channels (int): number of channel for the conditioning sequence, + conv_in_channels (int): number of channel for the input sequence, + conv_out_channels (int): number of channel for the output sequence, + conv_layers (int): + kpnet_ + """ + super().__init__() + + self.conv_in_channels = conv_in_channels + self.conv_out_channels = conv_out_channels + self.conv_kernel_size = conv_kernel_size + self.conv_layers = conv_layers + + l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers + l_b = conv_out_channels * conv_layers + + padding = (kpnet_conv_size - 1) // 2 + self.input_conv = torch.nn.Sequential( + torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + self.residual_conv = torch.nn.Sequential( + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Dropout(kpnet_dropout), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels, kpnet_conv_size, padding=padding, bias=True), + getattr(torch.nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), + ) + + self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size, padding=padding, bias=True) + self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size, padding=padding, bias=True) + + def forward(self, c): + """ + Args: + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + Returns: + """ + batch, _, cond_length = c.shape + + c = self.input_conv(c) + c = c + self.residual_conv(c) + k = 
self.kernel_conv(c) + b = self.bias_conv(c) + + kernels = k.contiguous().view( + batch, self.conv_layers, self.conv_in_channels, self.conv_out_channels, self.conv_kernel_size, cond_length + ) + bias = b.contiguous().view(batch, self.conv_layers, self.conv_out_channels, cond_length) + return kernels, bias + + +class LVCBlock(torch.nn.Module): + """the location-variable convolutions""" + + def __init__( + self, + in_channels, + cond_channels, + upsample_ratio, + conv_layers=4, + conv_kernel_size=3, + cond_hop_length=256, + kpnet_hidden_channels=64, + kpnet_conv_size=3, + kpnet_dropout=0.0, + ): + super().__init__() + + self.cond_hop_length = cond_hop_length + self.conv_layers = conv_layers + self.conv_kernel_size = conv_kernel_size + self.convs = torch.nn.ModuleList() + + self.upsample = torch.nn.ConvTranspose1d( + in_channels, + in_channels, + kernel_size=upsample_ratio * 2, + stride=upsample_ratio, + padding=upsample_ratio // 2 + upsample_ratio % 2, + output_padding=upsample_ratio % 2, + ) + + self.kernel_predictor = KernelPredictor( + cond_channels=cond_channels, + conv_in_channels=in_channels, + conv_out_channels=2 * in_channels, + conv_layers=conv_layers, + conv_kernel_size=conv_kernel_size, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=kpnet_dropout, + ) + + for i in range(conv_layers): + padding = (3 ** i) * int((conv_kernel_size - 1) / 2) + conv = torch.nn.Conv1d( + in_channels, in_channels, kernel_size=conv_kernel_size, padding=padding, dilation=3 ** i + ) + + self.convs.append(conv) + + def forward(self, x, c): + """forward propagation of the location-variable convolutions. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length) + c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) + + Returns: + Tensor: the output sequence (batch, in_channels, in_length) + """ + in_channels = x.shape[1] + kernels, bias = self.kernel_predictor(c) + + x = F.leaky_relu(x, 0.2) + x = self.upsample(x) + + for i in range(self.conv_layers): + y = F.leaky_relu(x, 0.2) + y = self.convs[i](y) + y = F.leaky_relu(y, 0.2) + + k = kernels[:, i, :, :, :, :] + b = bias[:, i, :, :] + y = self.location_variable_convolution(y, k, b, 1, self.cond_hop_length) + x = x + torch.sigmoid(y[:, :in_channels, :]) * torch.tanh(y[:, in_channels:, :]) + return x + + @staticmethod + def location_variable_convolution(x, kernel, bias, dilation, hop_size): + """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. + Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. + Args: + x (Tensor): the input sequence (batch, in_channels, in_length). + kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) + bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) + dilation (int): the dilation of convolution. + hop_size (int): the hop_size of the conditioning sequence. + Returns: + (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). 
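As a shape sanity check for the location-variable convolution block above, the following sketch runs an LVCBlock with made-up sizes; the 8x upsample_ratio, the matching cond_hop_length of 8 and the 100-frame conditioning tensor are arbitrary choices that satisfy the `in_length == kernel_length * hop_size` assertion.

import torch
from TTS.vocoder.layers.lvc_block import LVCBlock

# 100 conditioning frames; the block upsamples x by 8, so cond_hop_length must also be 8
block = LVCBlock(in_channels=8, cond_channels=80, upsample_ratio=8, cond_hop_length=8)
x = torch.randn(2, 8, 100)   # intermediate features at the pre-upsample rate
c = torch.randn(2, 80, 100)  # mel conditioning used by the kernel predictor
y = block(x, c)
print(y.shape)  # torch.Size([2, 8, 800]) -> length multiplied by upsample_ratio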
+ """ + batch, _, in_length = x.shape + batch, _, out_channels, kernel_size, kernel_length = kernel.shape + + assert in_length == ( + kernel_length * hop_size + ), f"length of (x, kernel) is not matched, {in_length} vs {kernel_length * hop_size}" + + padding = dilation * int((kernel_size - 1) / 2) + x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) + x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) + + if hop_size < dilation: + x = F.pad(x, (0, dilation), "constant", 0) + x = x.unfold( + 3, dilation, dilation + ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) + x = x[:, :, :, :, :hop_size] + x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) + x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) + + o = torch.einsum("bildsk,biokl->bolsd", x, kernel) + o = o + bias.unsqueeze(-1).unsqueeze(-1) + o = o.contiguous().view(batch, out_channels, -1) + return o diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index e69de29b..9479095e 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -0,0 +1,153 @@ +import importlib +import re + +from coqpit import Coqpit + + +def to_camel(text): + text = text.capitalize() + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + + +def setup_model(config: Coqpit): + """Load models directly from configuration.""" + print(" > Vocoder Model: {}".format(config.model)) + if "discriminator_model" in config and "generator_model" in config: + MyModel = importlib.import_module("TTS.vocoder.models.gan") + MyModel = getattr(MyModel, "GAN") + else: + MyModel = importlib.import_module("TTS.vocoder.models." + config.model.lower()) + if config.model.lower() == "wavernn": + MyModel = getattr(MyModel, "Wavernn") + elif config.model.lower() == "gan": + MyModel = getattr(MyModel, "GAN") + elif config.model.lower() == "wavegrad": + MyModel = getattr(MyModel, "Wavegrad") + else: + MyModel = getattr(MyModel, to_camel(config.model)) + raise ValueError(f"Model {config.model} not exist!") + model = MyModel(config) + return model + + +def setup_generator(c): + """ TODO: use config object as arguments""" + print(" > Generator Model: {}".format(c.generator_model)) + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.generator_model)) + # this is to preserve the Wavernn class name (instead of Wavernn) + if c.generator_model.lower() in "hifigan_generator": + model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) + elif c.generator_model.lower() in "melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model in "melgan_fb_generator": + raise ValueError("melgan_fb_generator is now fullband_melgan_generator") + elif c.generator_model.lower() in "multiband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=4, + proj_kernel=7, + base_channels=384, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "fullband_melgan_generator": + model = MyModel( + in_channels=c.audio["num_mels"], + out_channels=1, + proj_kernel=7, + base_channels=512, + upsample_factors=c.generator_model_params["upsample_factors"], + res_kernel=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + elif c.generator_model.lower() in "parallel_wavegan_generator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + aux_channels=c.audio["num_mels"], + dropout=0.0, + bias=True, + use_weight_norm=True, + upsample_factors=c.generator_model_params["upsample_factors"], + ) + elif c.generator_model.lower() in "univnet_generator": + model = MyModel(**c.generator_model_params) + else: + raise NotImplementedError(f"Model {c.generator_model} not implemented!") + return model + + +def setup_discriminator(c): + """ TODO: use config objekt as arguments""" + print(" > Discriminator Model: {}".format(c.discriminator_model)) + if "parallel_wavegan" in c.discriminator_model: + MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") + else: + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) + MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) + if c.discriminator_model in "hifigan_discriminator": + model = MyModel() + if c.discriminator_model in "random_window_discriminator": + model = MyModel( + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], + cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], + cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_sizes=(5, 3), + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], + res_channels=64, + gate_channels=128, + skip_channels=64, + dropout=0.0, + bias=True, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + ) + if c.discriminator_model == "parallel_wavegan_discriminator": + model = MyModel( + in_channels=1, + out_channels=1, + kernel_size=3, + num_layers=c.discriminator_model_params["num_layers"], + conv_channels=64, + dilation_factor=1, + nonlinear_activation="LeakyReLU", + nonlinear_activation_params={"negative_slope": 0.2}, + bias=True, + ) + if c.discriminator_model == "univnet_discriminator": + model = MyModel() + return model diff --git a/TTS/vocoder/models/base_vocoder.py b/TTS/vocoder/models/base_vocoder.py new file mode 100644 index 00000000..f879cd42 --- /dev/null +++ b/TTS/vocoder/models/base_vocoder.py @@ -0,0 +1,20 @@ +from TTS.model import BaseModel + +# pylint: skip-file + + +class BaseVocoder(BaseModel): + """Base `vocoder` class. Every new `vocoder` model must inherit this. + + It defines `vocoder` specific functions on top of `Model`. + + Notes on input/output tensor shapes: + Any input or output tensor of the model must be shaped as + + - 3D tensors `batch x time x channels` + - 2D tensors `batch x channels` + - 1D tensors `batch x 1` + """ + + def __init__(self): + super().__init__() diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py new file mode 100644 index 00000000..39176155 --- /dev/null +++ b/TTS/vocoder/models/gan.py @@ -0,0 +1,349 @@ +from inspect import signature +from typing import Dict, List, Tuple + +import numpy as np +import torch +from coqpit import Coqpit +from torch import nn +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets.gan_dataset import GANDataset +from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss +from TTS.vocoder.models import setup_discriminator, setup_generator +from TTS.vocoder.models.base_vocoder import BaseVocoder +from TTS.vocoder.utils.generic_utils import plot_results + + +class GAN(BaseVocoder): + def __init__(self, config: Coqpit): + """Wrap a generator and a discriminator network. 
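A short sketch of the vocoder model factory defined above. `MultibandMelganConfig` is just one example of a config that declares both a generator and a discriminator and therefore resolves to the `GAN` wrapper.

from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.models import setup_model

config = MultibandMelganConfig()
model = setup_model(config)  # both `generator_model` and `discriminator_model` are set -> GAN wrapper
print(type(model).__name__)  # GAN
print(type(model.model_g).__name__, type(model.model_d).__name__)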
It provides a compatible interface for the trainer. + It also helps mixing and matching different generator and disciminator networks easily. + + To implement a new GAN models, you just need to define the generator and the discriminator networks, the rest + is handled by the `GAN` class. + + Args: + config (Coqpit): Model configuration. + + Examples: + Initializing the GAN model with HifiGAN generator and discriminator. + >>> from TTS.vocoder.configs import HifiganConfig + >>> config = HifiganConfig() + >>> model = GAN(config) + """ + super().__init__() + self.config = config + self.model_g = setup_generator(config) + self.model_d = setup_discriminator(config) + self.train_disc = False # if False, train only the generator. + self.y_hat_g = None # the last generator prediction to be passed onto the discriminator + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Run the generator's forward pass. + + Args: + x (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: output of the GAN generator network. + """ + return self.model_g.forward(x) + + def inference(self, x: torch.Tensor) -> torch.Tensor: + """Run the generator's inference pass. + + Args: + x (torch.Tensor): Input tensor. + Returns: + torch.Tensor: output of the GAN generator network. + """ + return self.model_g.inference(x) + + def train_step(self, batch: Dict, criterion: Dict, optimizer_idx: int) -> Tuple[Dict, Dict]: + """Compute model outputs and the loss values. `optimizer_idx` selects the generator or the discriminator for + network on the current pass. + + Args: + batch (Dict): Batch of samples returned by the dataloader. + criterion (Dict): Criterion used to compute the losses. + optimizer_idx (int): ID of the optimizer in use on the current pass. + + Raises: + ValueError: `optimizer_idx` is an unexpected value. + + Returns: + Tuple[Dict, Dict]: model outputs and the computed loss values. + """ + outputs = None + loss_dict = None + + x = batch["input"] + y = batch["waveform"] + + if optimizer_idx not in [0, 1]: + raise ValueError(" [!] 
Unexpected `optimizer_idx`.") + + if optimizer_idx == 0: + # GENERATOR + # generator pass + y_hat = self.model_g(x)[:, :, : y.size(2)] + self.y_hat_g = y_hat # save for discriminator + y_hat_sub = None + y_sub = None + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat_sub = y_hat + y_hat = self.model_g.pqmf_synthesis(y_hat) + self.y_hat_g = y_hat # save for discriminator + y_sub = self.model_g.pqmf_analysis(y) + + scores_fake, feats_fake, feats_real = None, None, None + if self.train_disc: + + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat, x) + else: + D_out_fake = self.model_d(y_hat) + D_out_real = None + + if self.config.use_feat_match_loss: + with torch.no_grad(): + D_out_real = self.model_d(y) + + # format D outputs + if isinstance(D_out_fake, tuple): + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + feats_real = None + else: + _, feats_real = D_out_real + else: + scores_fake = D_out_fake + feats_fake, feats_real = None, None + + # compute losses + loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub) + outputs = {"model_outputs": y_hat} + + if optimizer_idx == 1: + # DISCRIMINATOR + if self.train_disc: + # use different samples for G and D trainings + if self.config.diff_samples_for_G_and_D: + x_d = batch["input_disc"] + y_d = batch["waveform_disc"] + # use a different sample than generator + with torch.no_grad(): + y_hat = self.model_g(x_d) + + # PQMF formatting + if y_hat.shape[1] > 1: + y_hat = self.model_g.pqmf_synthesis(y_hat) + else: + # use the same samples as generator + x_d = x.clone() + y_d = y.clone() + y_hat = self.y_hat_g + + # run D with or without cond. features + if len(signature(self.model_d.forward).parameters) == 2: + D_out_fake = self.model_d(y_hat.detach().clone(), x_d) + D_out_real = self.model_d(y_d, x_d) + else: + D_out_fake = self.model_d(y_hat.detach()) + D_out_real = self.model_d(y_d) + + # format D outputs + if isinstance(D_out_fake, tuple): + # self.model_d returns scores and features + scores_fake, feats_fake = D_out_fake + if D_out_real is None: + scores_real, feats_real = None, None + else: + scores_real, feats_real = D_out_real + else: + # model D returns only scores + scores_fake = D_out_fake + scores_real = D_out_real + + # compute losses + loss_dict = criterion[optimizer_idx](scores_fake, scores_real) + outputs = {"model_outputs": y_hat} + + return outputs, loss_dict + + @staticmethod + def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + """Logging shared by the training and evaluation. + + Args: + name (str): Name of the run. `train` or `eval`, + ap (AudioProcessor): Audio processor used in training. + batch (Dict): Batch used in the last train/eval step. + outputs (Dict): Model outputs from the last train/eval step. + + Returns: + Tuple[Dict, Dict]: log figures and audio samples. 
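To summarize the two-pass protocol implemented by `train_step` above, here is a hedged wiring sketch. The criterion, optimizer and scheduler getters used here are the GAN methods defined further below, and the actual `train_step` calls are left as comments because they need a real feature/waveform batch.

from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.models.gan import GAN

model = GAN(HifiganConfig())
criteria = model.get_criterion()    # [GeneratorLoss, DiscriminatorLoss]
optimizers = model.get_optimizer()  # [generator optimizer, discriminator optimizer]
schedulers = model.get_scheduler(optimizers)

# one trainer iteration runs both passes over the same batch:
#   _, gen_losses = model.train_step(batch, criteria, optimizer_idx=0)   # generator pass
#   _, disc_losses = model.train_step(batch, criteria, optimizer_idx=1)  # discriminator pass
# the discriminator only starts contributing once trainer.total_steps_done reaches
# config.steps_to_start_discriminator (see on_train_step_start below).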
+        """
+        y_hat = outputs[0]["model_outputs"]
+        y = batch["waveform"]
+        figures = plot_results(y_hat, y, ap, name)
+        sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
+        audios = {f"{name}/audio": sample_voice}
+        return figures, audios
+
+    def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
+        """Call `_log()` for training."""
+        return self._log("train", ap, batch, outputs)
+
+    @torch.no_grad()
+    def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
+        """Call `train_step()` with `no_grad()`."""
+        return self.train_step(batch, criterion, optimizer_idx)
+
+    def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]:
+        """Call `_log()` for evaluation."""
+        return self._log("eval", ap, batch, outputs)
+
+    def load_checkpoint(
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,  # pylint: disable=unused-argument, redefined-builtin
+    ) -> None:
+        """Load a GAN checkpoint and initialize model parameters.
+
+        Args:
+            config (Coqpit): Model config.
+            checkpoint_path (str): Checkpoint file path.
+            eval (bool, optional): If true, load the model for inference. Defaults to False.
+        """
+        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        # band-aid for older than v0.0.15 GAN models
+        if "model_disc" in state:
+            self.model_g.load_checkpoint(config, checkpoint_path, eval)
+        else:
+            self.load_state_dict(state["model"])
+            if eval:
+                self.model_d = None
+                if hasattr(self.model_g, "remove_weight_norm"):
+                    self.model_g.remove_weight_norm()
+
+    def on_train_step_start(self, trainer) -> None:
+        """Enable the discriminator training based on `steps_to_start_discriminator`.
+
+        Args:
+            trainer (Trainer): Trainer object.
+        """
+        self.train_disc = trainer.total_steps_done >= self.config.steps_to_start_discriminator
+
+    def get_optimizer(self) -> List:
+        """Initiate and return the GAN optimizers based on the config parameters.
+
+        It returns 2 optimizers in a list. The first one is for the generator and the second one is for the
+        discriminator.
+
+        Returns:
+            List: optimizers.
+        """
+        optimizer1 = get_optimizer(
+            self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, self.model_g
+        )
+        optimizer2 = get_optimizer(
+            self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d
+        )
+        return [optimizer1, optimizer2]
+
+    def get_lr(self) -> List:
+        """Set the initial learning rates for each optimizer.
+
+        Returns:
+            List: learning rates for each optimizer.
+        """
+        return [self.config.lr_gen, self.config.lr_disc]
+
+    def get_scheduler(self, optimizer) -> List:
+        """Set the schedulers for each optimizer.
+
+        Args:
+            optimizer (List[`torch.optim.Optimizer`]): List of optimizers.
+
+        Returns:
+            List: Schedulers, one for each optimizer.
+        """
+        scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0])
+        scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1])
+        return [scheduler1, scheduler2]
+
+    @staticmethod
+    def format_batch(batch: List) -> Dict:
+        """Format the batch for training.
+
+        Args:
+            batch (List): Batch out of the dataloader.
+
+        Returns:
+            Dict: formatted model inputs.
+ """ + if isinstance(batch[0], list): + x_G, y_G = batch[0] + x_D, y_D = batch[1] + return {"input": x_G, "waveform": y_G, "input_disc": x_D, "waveform_disc": y_D} + x, y = batch + return {"input": x, "waveform": y} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + """Initiate and return the GAN dataloader. + + Args: + config (Coqpit): Model config. + ap (AudioProcessor): Audio processor. + is_eval (True): Set the dataloader for evaluation if true. + data_items (List): Data samples. + verbose (bool): Log information if true. + num_gpus (int): Number of GPUs in use. + + Returns: + DataLoader: Torch dataloader. + """ + dataset = GANDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + return_pairs=config.diff_samples_for_G_and_D if "diff_samples_for_G_and_D" in config else False, + is_training=not is_eval, + return_segments=not is_eval, + use_noise_augment=config.use_noise_augment, + use_cache=config.use_cache, + verbose=verbose, + ) + dataset.shuffle_mapping() + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + drop_last=False, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + return loader + + def get_criterion(self): + """Return criterions for the optimizers""" + return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)] diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index 8d595a63..f606c649 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -15,7 +15,7 @@ def get_padding(k, d): class ResBlock1(torch.nn.Module): """Residual Block Type 1. It has 3 convolutional layers in each convolutiona block. - Network: + Network:: x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o |--------------------------------------------------------------------------------------------------| @@ -105,7 +105,7 @@ class ResBlock1(torch.nn.Module): class ResBlock2(torch.nn.Module): """Residual Block Type 1. It has 3 convolutional layers in each convolutiona block. 
- Network: + Network:: x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o |---------------------------------------------------| diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py new file mode 100644 index 00000000..d99b2760 --- /dev/null +++ b/TTS/vocoder/models/univnet_discriminator.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils import spectral_norm, weight_norm + +from TTS.utils.audio import TorchSTFT +from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator + +LRELU_SLOPE = 0.1 + + +class SpecDiscriminator(nn.Module): + """docstring for Discriminator.""" + + def __init__(self, fft_size=1024, hop_length=120, win_length=600, use_spectral_norm=False): + super().__init__() + norm_f = weight_norm if use_spectral_norm is False else spectral_norm + self.fft_size = fft_size + self.hop_length = hop_length + self.win_length = win_length + self.stft = TorchSTFT(fft_size, hop_length, win_length) + self.discriminators = nn.ModuleList( + [ + norm_f(nn.Conv2d(1, 32, kernel_size=(3, 9), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 9), stride=(1, 2), padding=(1, 4))), + norm_f(nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))), + ] + ) + + self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1)) + + def forward(self, y): + + fmap = [] + with torch.no_grad(): + y = y.squeeze(1) + y = self.stft(y) + y = y.unsqueeze(1) + for _, d in enumerate(self.discriminators): + y = d(y) + y = F.leaky_relu(y, LRELU_SLOPE) + fmap.append(y) + + y = self.out(y) + fmap.append(y) + + return torch.flatten(y, 1, -1), fmap + + +class MultiResSpecDiscriminator(torch.nn.Module): + def __init__( # pylint: disable=dangerous-default-value + self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window" + ): + + super().__init__() + self.discriminators = nn.ModuleList( + [ + SpecDiscriminator(fft_sizes[0], hop_sizes[0], win_lengths[0], window), + SpecDiscriminator(fft_sizes[1], hop_sizes[1], win_lengths[1], window), + SpecDiscriminator(fft_sizes[2], hop_sizes[2], win_lengths[2], window), + ] + ) + + def forward(self, x): + scores = [] + feats = [] + for d in self.discriminators: + score, feat = d(x) + scores.append(score) + feats.append(feat) + + return scores, feats + + +class UnivnetDiscriminator(nn.Module): + """Univnet discriminator wrapping MPD and MSD.""" + + def __init__(self): + super().__init__() + self.mpd = MultiPeriodDiscriminator() + self.msd = MultiResSpecDiscriminator() + + def forward(self, x): + """ + Args: + x (Tensor): input waveform. + + Returns: + List[Tensor]: discriminator scores. + List[List[Tensor]]: list of list of features from each layers of each discriminator. 
+ """ + scores, feats = self.mpd(x) + scores_, feats_ = self.msd(x) + return scores + scores_, feats + feats_ diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py new file mode 100644 index 00000000..0a6bd4c8 --- /dev/null +++ b/TTS/vocoder/models/univnet_generator.py @@ -0,0 +1,137 @@ +import numpy as np +import torch +import torch.nn.functional as F + +from TTS.vocoder.layers.lvc_block import LVCBlock + +LRELU_SLOPE = 0.1 + + +class UnivnetGenerator(torch.nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + cond_channels, + upsample_factors, + lvc_layers_each_block, + lvc_kernel_size, + kpnet_hidden_channels, + kpnet_conv_size, + dropout, + use_weight_norm=True, + ): + + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.cond_channels = cond_channels + self.upsample_scale = np.prod(upsample_factors) + self.lvc_block_nums = len(upsample_factors) + + # define first convolution + self.first_conv = torch.nn.Conv1d( + in_channels, hidden_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True + ) + + # define residual blocks + self.lvc_blocks = torch.nn.ModuleList() + cond_hop_length = 1 + for n in range(self.lvc_block_nums): + cond_hop_length = cond_hop_length * upsample_factors[n] + lvcb = LVCBlock( + in_channels=hidden_channels, + cond_channels=cond_channels, + upsample_ratio=upsample_factors[n], + conv_layers=lvc_layers_each_block, + conv_kernel_size=lvc_kernel_size, + cond_hop_length=cond_hop_length, + kpnet_hidden_channels=kpnet_hidden_channels, + kpnet_conv_size=kpnet_conv_size, + kpnet_dropout=dropout, + ) + self.lvc_blocks += [lvcb] + + # define output layers + self.last_conv_layers = torch.nn.ModuleList( + [ + torch.nn.Conv1d( + hidden_channels, out_channels, kernel_size=7, padding=(7 - 1) // 2, dilation=1, bias=True + ), + ] + ) + + # apply weight norm + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, c): + """Calculate forward propagation. + Args: + c (Tensor): Local conditioning auxiliary features (B, C ,T'). 
+ Returns: + Tensor: Output tensor (B, out_channels, T) + """ + # random noise + x = torch.randn([c.shape[0], self.in_channels, c.shape[2]]) + x = x.to(self.first_conv.bias.device) + x = self.first_conv(x) + + for n in range(self.lvc_block_nums): + x = self.lvc_blocks[n](x, c) + + # apply final layers + for f in self.last_conv_layers: + x = F.leaky_relu(x, LRELU_SLOPE) + x = f(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + """Remove weight normalization module from all of the layers.""" + + def _remove_weight_norm(m): + try: + # print(f"Weight norm is removed from {m}.") + torch.nn.utils.remove_weight_norm(m) + except ValueError: # this module didn't have weight norm + return + + self.apply(_remove_weight_norm) + + def apply_weight_norm(self): + """Apply weight normalization module from all of the layers.""" + + def _apply_weight_norm(m): + if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)): + torch.nn.utils.weight_norm(m) + # print(f"Weight norm is applied to {m}.") + + self.apply(_apply_weight_norm) + + @staticmethod + def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2 ** x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + @property + def receptive_field_size(self): + """Return receptive field size.""" + return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + + @torch.no_grad() + def inference(self, c): + """Perform inference. + Args: + c (Tensor): Local conditioning auxiliary features :math:`(B, C, T)`. + Returns: + Tensor: Output tensor (T, out_channels) + """ + x = torch.randn([c.shape[0], self.in_channels, c.shape[2]]) + x = x.to(self.first_conv.bias.device) + + c = c.to(next(self.parameters())) + return self.forward(c) diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 84dde957..03d5160e 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,65 +1,105 @@ +from dataclasses import dataclass, field +from typing import Dict, List, Tuple + import numpy as np import torch +from coqpit import Coqpit from torch import nn from torch.nn.utils import weight_norm +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler -from ..layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.model import BaseModel +from TTS.utils.audio import AudioProcessor +from TTS.utils.trainer_utils import get_optimizer, get_scheduler +from TTS.vocoder.datasets import WaveGradDataset +from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock +from TTS.vocoder.utils.generic_utils import plot_results -class Wavegrad(nn.Module): +@dataclass +class WavegradArgs(Coqpit): + in_channels: int = 80 + out_channels: int = 1 + use_weight_norm: bool = False + y_conv_channels: int = 32 + x_conv_channels: int = 768 + dblock_out_channels: List[int] = field(default_factory=lambda: [128, 128, 256, 512]) + ublock_out_channels: List[int] = field(default_factory=lambda: [512, 512, 256, 128, 128]) + upsample_factors: List[int] = field(default_factory=lambda: [4, 4, 4, 2, 2]) + upsample_dilations: List[List[int]] = field( + default_factory=lambda: [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]] + ) + + +class Wavegrad(BaseModel): + """🐸 🌊 WaveGrad 🌊 model. + Paper - https://arxiv.org/abs/2009.00713 + + Examples: + Initializing the model. 
+ + >>> from TTS.vocoder.configs import WavegradConfig + >>> config = WavegradConfig() + >>> model = Wavegrad(config) + + Paper Abstract: + This paper introduces WaveGrad, a conditional model for waveform generation which estimates gradients of the + data density. The model is built on prior work on score matching and diffusion probabilistic models. It starts + from a Gaussian white noise signal and iteratively refines the signal via a gradient-based sampler conditioned + on the mel-spectrogram. WaveGrad offers a natural way to trade inference speed for sample quality by adjusting + the number of refinement steps, and bridges the gap between non-autoregressive and autoregressive models in + terms of audio quality. We find that it can generate high fidelity audio samples using as few as six iterations. + Experiments reveal WaveGrad to generate high fidelity audio, outperforming adversarial non-autoregressive + baselines and matching a strong likelihood-based autoregressive baseline using fewer sequential operations. + Audio samples are available at this https URL. + """ + # pylint: disable=dangerous-default-value - def __init__( - self, - in_channels=80, - out_channels=1, - use_weight_norm=False, - y_conv_channels=32, - x_conv_channels=768, - dblock_out_channels=[128, 128, 256, 512], - ublock_out_channels=[512, 512, 256, 128, 128], - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], - ): + def __init__(self, config: Coqpit): super().__init__() - - self.use_weight_norm = use_weight_norm - self.hop_len = np.prod(upsample_factors) + self.config = config + self.use_weight_norm = config.model_params.use_weight_norm + self.hop_len = np.prod(config.model_params.upsample_factors) self.noise_level = None self.num_steps = None self.beta = None self.alpha = None self.alpha_hat = None - self.noise_level = None self.c1 = None self.c2 = None self.sigma = None # dblocks - self.y_conv = Conv1d(1, y_conv_channels, 5, padding=2) + self.y_conv = Conv1d(1, config.model_params.y_conv_channels, 5, padding=2) self.dblocks = nn.ModuleList([]) - ic = y_conv_channels - for oc, df in zip(dblock_out_channels, reversed(upsample_factors)): + ic = config.model_params.y_conv_channels + for oc, df in zip(config.model_params.dblock_out_channels, reversed(config.model_params.upsample_factors)): self.dblocks.append(DBlock(ic, oc, df)) ic = oc # film self.film = nn.ModuleList([]) - ic = y_conv_channels - for oc in reversed(ublock_out_channels): + ic = config.model_params.y_conv_channels + for oc in reversed(config.model_params.ublock_out_channels): self.film.append(FiLM(ic, oc)) ic = oc - # ublocks + # ublocksn self.ublocks = nn.ModuleList([]) - ic = x_conv_channels - for oc, uf, ud in zip(ublock_out_channels, upsample_factors, upsample_dilations): + ic = config.model_params.x_conv_channels + for oc, uf, ud in zip( + config.model_params.ublock_out_channels, + config.model_params.upsample_factors, + config.model_params.upsample_dilations, + ): self.ublocks.append(UBlock(ic, oc, uf, ud)) ic = oc - self.x_conv = Conv1d(in_channels, x_conv_channels, 3, padding=1) - self.out_conv = Conv1d(oc, out_channels, 3, padding=1) + self.x_conv = Conv1d(config.model_params.in_channels, config.model_params.x_conv_channels, 3, padding=1) + self.out_conv = Conv1d(oc, config.model_params.out_channels, 3, padding=1) - if use_weight_norm: + if config.model_params.use_weight_norm: self.apply_weight_norm() def forward(self, x, spectrogram, noise_scale): @@ -180,7 +220,7 @@ 
class Wavegrad(nn.Module): if eval: self.eval() assert not self.training - if self.use_weight_norm: + if self.config.model_params.use_weight_norm: self.remove_weight_norm() betas = np.linspace( config["test_noise_schedule"]["min_val"], @@ -195,3 +235,93 @@ class Wavegrad(nn.Module): config["train_noise_schedule"]["num_steps"], ) self.compute_noise_level(betas) + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + # format data + x = batch["input"] + y = batch["waveform"] + + # set noise scale + noise, x_noisy, noise_scale = self.compute_y_n(y) + + # forward pass + noise_hat = self.forward(x_noisy, x, noise_scale) + + # compute losses + loss = criterion(noise, noise_hat) + return {"model_output": noise_hat}, {"loss": loss} + + def train_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + @torch.no_grad() + def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + def eval_log(self, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, np.ndarray]: + return None, None + + def test_run(self, ap: AudioProcessor, samples: List[Dict], ouputs: Dict): # pylint: disable=unused-argument + # setup noise schedule and inference + noise_schedule = self.config["test_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) + for sample in samples: + x = sample["input"] + y = sample["waveform"] + # compute voice + y_pred = self.inference(x) + # compute spectrograms + figures = plot_results(y_pred, y, ap, "test") + # Sample audio + sample_voice = y_pred[0].squeeze(0).detach().cpu().numpy() + return figures, {"test/audio": sample_voice} + + def get_optimizer(self): + return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self) + + def get_scheduler(self, optimizer): + return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, optimizer) + + def get_criterion(self): + return torch.nn.L1Loss() + + @staticmethod + def format_batch(batch: Dict) -> Dict: + # return a whole audio segment + m, y = batch[0], batch[1] + y = y.unsqueeze(1) + return {"input": m, "waveform": y} + + def get_data_loader( + self, config: Coqpit, ap: AudioProcessor, is_eval: True, data_items: List, verbose: bool, num_gpus: int + ): + dataset = WaveGradDataset( + ap=ap, + items=data_items, + seq_len=self.config.seq_len, + hop_len=ap.hop_length, + pad_short=self.config.pad_short, + conv_pad=self.config.conv_pad, + is_training=not is_eval, + return_segments=True, + use_noise_augment=False, + use_cache=config.use_cache, + verbose=verbose, + ) + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=self.config.batch_size, + shuffle=num_gpus <= 1, + drop_last=False, + sampler=sampler, + num_workers=self.config.num_eval_loader_workers if is_eval else self.config.num_loader_workers, + pin_memory=False, + ) + return loader + + def on_epoch_start(self, trainer): # pylint: disable=unused-argument + noise_schedule = self.config["train_noise_schedule"] + betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"]) + self.compute_noise_level(betas) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 994244dc..a5d89d5a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -1,13 +1,21 @@ import 
sys
 import time
+from dataclasses import dataclass, field
+from typing import Dict, List, Tuple

 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from coqpit import Coqpit
+from torch.utils.data import DataLoader
+from torch.utils.data.distributed import DistributedSampler

-# fix this
-from TTS.utils.audio import AudioProcessor as ap
+from TTS.tts.utils.visual import plot_spectrogram
+from TTS.utils.audio import AudioProcessor
+from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
+from TTS.vocoder.layers.losses import WaveRNNLoss
+from TTS.vocoder.models.base_vocoder import BaseVocoder
 from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian

@@ -135,89 +143,145 @@ class Upsample(nn.Module):
         return m.transpose(1, 2), aux

-class WaveRNN(nn.Module):
-    def __init__(
-        self,
-        rnn_dims,
-        fc_dims,
-        mode,
-        mulaw,
-        pad,
-        use_aux_net,
-        use_upsample_net,
-        upsample_factors,
-        feat_dims,
-        compute_dims,
-        res_out_dims,
-        num_res_blocks,
-        hop_length,
-        sample_rate,
-    ):
+@dataclass
+class WavernnArgs(Coqpit):
+    """🐸 WaveRNN model arguments.
+
+    rnn_dims (int):
+        Number of hidden channels in RNN layers. Defaults to 512.
+    fc_dims (int):
+        Number of hidden channels in fully-connected layers. Defaults to 512.
+    compute_dims (int):
+        Number of hidden channels in the feature ResNet. Defaults to 128.
+    res_out_dims (int):
+        Number of hidden channels in the feature ResNet output. Defaults to 128.
+    num_res_blocks (int):
+        Number of residual blocks in the ResNet. Defaults to 10.
+    use_aux_net (bool):
+        enable/disable the feature ResNet. Defaults to True.
+    use_upsample_net (bool):
+        enable/disable the upsampling network. If False, basic upsampling is used. Defaults to True.
+    upsample_factors (list):
+        Upsampling factors. The product of the values must equal the `hop_length`. Defaults to ```[4, 8, 8]```.
+    mode (str):
+        Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
+        Gaussian Distribution and `bits` for quantized bits as the model's output.
+    mulaw (bool):
+        enable/disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
+        to `True`.
+    pad (int):
+        Padding applied to the input feature frames against the convolution layers of the feature network.
+        Defaults to 2.
+    """
+
+    rnn_dims: int = 512
+    fc_dims: int = 512
+    compute_dims: int = 128
+    res_out_dims: int = 128
+    num_res_blocks: int = 10
+    use_aux_net: bool = True
+    use_upsample_net: bool = True
+    upsample_factors: List[int] = field(default_factory=lambda: [4, 8, 8])
+    mode: str = "mold"  # mold [string], gauss [string], bits [int]
+    mulaw: bool = True  # apply mulaw if mode is bits
+    pad: int = 2
+    feat_dims: int = 80
+
+
+class Wavernn(BaseVocoder):
+    def __init__(self, config: Coqpit):
+        """🐸 WaveRNN model.
+        Original paper - https://arxiv.org/abs/1802.08435
+        Official implementation - https://github.com/fatchord/WaveRNN
+
+        Args:
+            config (Coqpit): Model configuration.
+
+        Raises:
+            RuntimeError: If `config.model_params.mode` is not a known output mode.
+
+        Examples:
+            >>> from TTS.vocoder.configs import WavernnConfig
+            >>> config = WavernnConfig()
+            >>> model = Wavernn(config)
+
+        Paper Abstract:
+            Sequential models achieve state-of-the-art results in audio, visual and textual domains with respect to
+            both estimating the data distribution and generating high-quality samples. Efficient sampling for this
+            class of models has however remained an elusive problem.
With a focus on text-to-speech synthesis, we + describe a set of general techniques for reducing sampling time while maintaining high output quality. + We first describe a single-layer recurrent neural network, the WaveRNN, with a dual softmax layer that + matches the quality of the state-of-the-art WaveNet model. The compact form of the network makes it + possible to generate 24kHz 16-bit audio 4x faster than real time on a GPU. Second, we apply a weight + pruning technique to reduce the number of weights in the WaveRNN. We find that, for a constant number of + parameters, large sparse networks perform better than small dense networks and this relationship holds for + sparsity levels beyond 96%. The small number of weights in a Sparse WaveRNN makes it possible to sample + high-fidelity audio on a mobile CPU in real time. Finally, we propose a new generation scheme based on + subscaling that folds a long sequence into a batch of shorter sequences and allows one to generate multiple + samples at once. The Subscale WaveRNN produces 16 samples per step without loss of quality and offers an + orthogonal method for increasing sampling efficiency. + """ super().__init__() - self.mode = mode - self.mulaw = mulaw - self.pad = pad - self.use_upsample_net = use_upsample_net - self.use_aux_net = use_aux_net - if isinstance(self.mode, int): - self.n_classes = 2 ** self.mode - elif self.mode == "mold": + + self.args = config.model_params + self.config = config + + if isinstance(self.args.mode, int): + self.n_classes = 2 ** self.args.mode + elif self.args.mode == "mold": self.n_classes = 3 * 10 - elif self.mode == "gauss": + elif self.args.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) - self.rnn_dims = rnn_dims - self.aux_dims = res_out_dims // 4 - self.hop_length = hop_length - self.sample_rate = sample_rate + self.aux_dims = self.args.res_out_dims // 4 - if self.use_upsample_net: + if self.args.use_upsample_net: assert ( - np.cumproduct(upsample_factors)[-1] == self.hop_length + np.cumproduct(self.args.upsample_factors)[-1] == config.audio.hop_length ), " [!] 
upsample scales needs to be equal to hop_length" self.upsample = UpsampleNetwork( - feat_dims, - upsample_factors, - compute_dims, - num_res_blocks, - res_out_dims, - pad, - use_aux_net, + self.args.feat_dims, + self.args.upsample_factors, + self.args.compute_dims, + self.args.num_res_blocks, + self.args.res_out_dims, + self.args.pad, + self.args.use_aux_net, ) else: self.upsample = Upsample( - hop_length, - pad, - num_res_blocks, - feat_dims, - compute_dims, - res_out_dims, - use_aux_net, + config.audio.hop_length, + self.args.pad, + self.args.num_res_blocks, + self.args.feat_dims, + self.args.compute_dims, + self.args.res_out_dims, + self.args.use_aux_net, ) - if self.use_aux_net: - self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + if self.args.use_aux_net: + self.I = nn.Linear(self.args.feat_dims + self.aux_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims + self.aux_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims + self.aux_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims + self.aux_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) else: - self.I = nn.Linear(feat_dims + 1, rnn_dims) - self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.fc1 = nn.Linear(rnn_dims, fc_dims) - self.fc2 = nn.Linear(fc_dims, fc_dims) - self.fc3 = nn.Linear(fc_dims, self.n_classes) + self.I = nn.Linear(self.args.feat_dims + 1, self.args.rnn_dims) + self.rnn1 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(self.args.rnn_dims, self.args.rnn_dims, batch_first=True) + self.fc1 = nn.Linear(self.args.rnn_dims, self.args.fc_dims) + self.fc2 = nn.Linear(self.args.fc_dims, self.args.fc_dims) + self.fc3 = nn.Linear(self.args.fc_dims, self.n_classes) def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) - h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.args.rnn_dims).to(x.device) mels, aux = self.upsample(mels) - if self.use_aux_net: + if self.args.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] a1 = aux[:, :, aux_idx[0] : aux_idx[1]] a2 = aux[:, :, aux_idx[1] : aux_idx[2]] @@ -226,7 +290,7 @@ class WaveRNN(nn.Module): x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) - if self.use_aux_net + if self.args.use_aux_net else torch.cat([x.unsqueeze(-1), mels], dim=2) ) x = self.I(x) @@ -236,22 +300,21 @@ class WaveRNN(nn.Module): x = x + res res = x - x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + x = torch.cat([x, a2], dim=2) if self.args.use_aux_net else x self.rnn2.flatten_parameters() x, _ = self.rnn2(x, h2) x = x + res - x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = torch.cat([x, a3], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = torch.cat([x, a4], dim=2) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) return self.fc3(x) def inference(self, 
mels, batched=None, target=None, overlap=None): self.eval() - device = mels.device output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) @@ -259,13 +322,13 @@ class WaveRNN(nn.Module): with torch.no_grad(): if isinstance(mels, np.ndarray): - mels = torch.FloatTensor(mels).to(device) + mels = torch.FloatTensor(mels).type_as(mels) if mels.ndim == 2: mels = mels.unsqueeze(0) - wave_len = (mels.size(-1) - 1) * self.hop_length + wave_len = (mels.size(-1) - 1) * self.config.audio.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.args.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -275,11 +338,11 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).to(device) - h2 = torch.zeros(b_size, self.rnn_dims).to(device) - x = torch.zeros(b_size, 1).to(device) + h1 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) + h2 = torch.zeros(b_size, self.args.rnn_dims).type_as(mels) + x = torch.zeros(b_size, 1).type_as(mels) - if self.use_aux_net: + if self.args.use_aux_net: d = self.aux_dims aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] @@ -287,35 +350,35 @@ class WaveRNN(nn.Module): m_t = mels[:, i, :] - if self.use_aux_net: + if self.args.use_aux_net: a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) - x = torch.cat([x, m_t, a1_t], dim=1) if self.use_aux_net else torch.cat([x, m_t], dim=1) + x = torch.cat([x, m_t, a1_t], dim=1) if self.args.use_aux_net else torch.cat([x, m_t], dim=1) x = self.I(x) h1 = rnn1(x, h1) x = x + h1 - inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + inp = torch.cat([x, a2_t], dim=1) if self.args.use_aux_net else x h2 = rnn2(inp, h2) x = x + h2 - x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a3_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc1(x)) - x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = torch.cat([x, a4_t], dim=1) if self.args.use_aux_net else x x = F.relu(self.fc2(x)) logits = self.fc3(x) - if self.mode == "mold": + if self.args.mode == "mold": sample = sample_from_discretized_mix_logistic(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) - elif self.mode == "gauss": + x = sample.transpose(0, 1).type_as(mels) + elif self.args.mode == "gauss": sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).to(device) - elif isinstance(self.mode, int): + x = sample.transpose(0, 1).type_as(mels) + elif isinstance(self.args.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -323,7 +386,7 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError("Unknown model mode value - ", self.args.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -338,22 +401,22 @@ class WaveRNN(nn.Module): else: output = output[0] - if self.mulaw and isinstance(self.mode, int): - output = ap.mulaw_decode(output, self.mode) + if self.args.mulaw and isinstance(self.args.mode, int): + output = AudioProcessor.mulaw_decode(output, self.args.mode) # Fade-out at the end to avoid signal cutting out suddenly - fade_out = np.linspace(1, 0, 20 * self.hop_length) + fade_out = np.linspace(1, 0, 20 * self.config.audio.hop_length) 
output = output[:wave_len] if wave_len > len(fade_out): - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.config.audio.hop_length :] *= fade_out self.train() return output def gen_display(self, i, seq_len, b_size, start): gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 - realtime_ratio = gen_rate * 1000 / self.sample_rate + realtime_ratio = gen_rate * 1000 / self.config.audio.sample_rate stream( "%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), @@ -487,3 +550,83 @@ class WaveRNN(nn.Module): if eval: self.eval() assert not self.training + + def train_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + mels = batch["input"] + waveform = batch["waveform"] + waveform_coarse = batch["waveform_coarse"] + + y_hat = self.forward(waveform, mels) + if isinstance(self.args.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + waveform_coarse = waveform_coarse.float() + waveform_coarse = waveform_coarse.unsqueeze(-1) + # compute losses + loss_dict = criterion(y_hat, waveform_coarse) + return {"model_output": y_hat}, loss_dict + + def eval_step(self, batch: Dict, criterion: Dict) -> Tuple[Dict, Dict]: + return self.train_step(batch, criterion) + + @torch.no_grad() + def test_run( + self, ap: AudioProcessor, samples: List[Dict], output: Dict # pylint: disable=unused-argument + ) -> Tuple[Dict, Dict]: + figures = {} + audios = {} + for idx, sample in enumerate(samples): + x = sample["input"] + y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples) + x_hat = ap.melspectrogram(y_hat) + figures.update( + { + f"test_{idx}/ground_truth": plot_spectrogram(x.T), + f"test_{idx}/prediction": plot_spectrogram(x_hat.T), + } + ) + audios.update({f"test_{idx}/audio", y_hat}) + return figures, audios + + @staticmethod + def format_batch(batch: Dict) -> Dict: + waveform = batch[0] + mels = batch[1] + waveform_coarse = batch[2] + return {"input": mels, "waveform": waveform, "waveform_coarse": waveform_coarse} + + def get_data_loader( # pylint: disable=no-self-use + self, + config: Coqpit, + ap: AudioProcessor, + is_eval: True, + data_items: List, + verbose: bool, + num_gpus: int, + ): + dataset = WaveRNNDataset( + ap=ap, + items=data_items, + seq_len=config.seq_len, + hop_len=ap.hop_length, + pad=config.model_params.pad, + mode=config.model_params.mode, + mulaw=config.model_params.mulaw, + is_training=not is_eval, + verbose=verbose, + ) + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None + loader = DataLoader( + dataset, + batch_size=1 if is_eval else config.batch_size, + shuffle=num_gpus == 0, + collate_fn=dataset.collate, + sampler=sampler, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=True, + ) + return loader + + def get_criterion(self): + # define train functions + return WaveRNNLoss(self.args.mode) diff --git a/TTS/vocoder/tf/layers/pqmf.py b/TTS/vocoder/tf/layers/pqmf.py index 81b666b9..042f2f08 100644 --- a/TTS/vocoder/tf/layers/pqmf.py +++ b/TTS/vocoder/tf/layers/pqmf.py @@ -34,7 +34,7 @@ class PQMF(tf.keras.layers.Layer): def analysis(self, x): """ - x : B x 1 x T + x : :math:`[B, 1, T]` """ x = tf.transpose(x, perm=[0, 2, 1]) x = tf.pad(x, [[0, 0], [self.taps // 2, self.taps // 2], [0, 0]], constant_values=0.0) diff --git a/TTS/vocoder/tf/models/melgan_generator.py b/TTS/vocoder/tf/models/melgan_generator.py index 205a240e..09ee9530 100644 
--- a/TTS/vocoder/tf/models/melgan_generator.py +++ b/TTS/vocoder/tf/models/melgan_generator.py @@ -92,7 +92,7 @@ class MelganGenerator(tf.keras.models.Model): @tf.function(experimental_relax_shapes=True) def call(self, c, training=False): """ - c : B x C x T + c : :math:`[B, C, T]` """ if training: raise NotImplementedError() diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 5c2742c8..fe706ba9 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -113,7 +113,7 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): """ Sample from discretized mixture of logistic distributions Args: - y (Tensor): B x C x T + y (Tensor): :math:`[B, C, T]` log_scale_min (float): Log scale minimum value Returns: Tensor: sample in range of [-1, 1]. @@ -149,8 +149,6 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis - one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() - if tensor.is_cuda: - one_hot = one_hot.cuda() + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_().type_as(tensor) one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) return one_hot diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index cb45feb0..eeabbea5 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -1,6 +1,3 @@ -import importlib -import re - import numpy as np import torch from matplotlib import pyplot as plt @@ -29,7 +26,7 @@ def interpolate_vocoder_input(scale_factor, spec): return spec -def plot_results(y_hat, y, ap, global_step, name_prefix): +def plot_results(y_hat, y, ap, name_prefix): """Plot vocoder model results""" # select an instance from batch @@ -47,7 +44,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): plt.title("groundtruth speech") plt.subplot(2, 1, 2) plt.plot(y_hat) - plt.title(f"generated speech @ {global_step} steps") + plt.title("generated speech") plt.tight_layout() plt.close() @@ -58,162 +55,3 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): name_prefix + "speech_comparison": fig_wave, } return figures - - -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - -def setup_generator(c): - print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) - # this is to preserve the WaveRNN class name (instead of Wavernn) - if c.generator_model.lower() == "wavernn": - MyModel = getattr(MyModel, "WaveRNN") - else: - MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model.lower() in "wavernn": - model = MyModel( - rnn_dims=c.wavernn_model_params["rnn_dims"], - fc_dims=c.wavernn_model_params["fc_dims"], - mode=c.mode, - mulaw=c.mulaw, - pad=c.padding, - use_aux_net=c.wavernn_model_params["use_aux_net"], - use_upsample_net=c.wavernn_model_params["use_upsample_net"], - upsample_factors=c.wavernn_model_params["upsample_factors"], - feat_dims=c.audio["num_mels"], - compute_dims=c.wavernn_model_params["compute_dims"], - res_out_dims=c.wavernn_model_params["res_out_dims"], - num_res_blocks=c.wavernn_model_params["num_res_blocks"], - hop_length=c.audio["hop_length"], - sample_rate=c.audio["sample_rate"], - ) - elif c.generator_model.lower() in "hifigan_generator": - model = MyModel(in_channels=c.audio["num_mels"], out_channels=1, **c.generator_model_params) - elif c.generator_model.lower() in "melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model in "melgan_fb_generator": - raise ValueError("melgan_fb_generator is now fullband_melgan_generator") - elif c.generator_model.lower() in "multiband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=4, - proj_kernel=7, - base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "fullband_melgan_generator": - model = MyModel( - in_channels=c.audio["num_mels"], - out_channels=1, - proj_kernel=7, - base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], - res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - elif c.generator_model.lower() in "parallel_wavegan_generator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=c.audio["num_mels"], - dropout=0.0, - bias=True, - use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) - elif c.generator_model.lower() in "wavegrad": - model = MyModel( - in_channels=c["audio"]["num_mels"], - out_channels=1, - use_weight_norm=c["model_params"]["use_weight_norm"], - x_conv_channels=c["model_params"]["x_conv_channels"], - y_conv_channels=c["model_params"]["y_conv_channels"], - dblock_out_channels=c["model_params"]["dblock_out_channels"], - ublock_out_channels=c["model_params"]["ublock_out_channels"], - upsample_factors=c["model_params"]["upsample_factors"], - upsample_dilations=c["model_params"]["upsample_dilations"], - ) - else: - raise NotImplementedError(f"Model {c.generator_model} not implemented!") - return model - - -def setup_discriminator(c): - print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: - MyModel = importlib.import_module("TTS.vocoder.models.parallel_wavegan_discriminator") - else: - MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.discriminator_model.lower()) - MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "hifigan_discriminator": - model = MyModel() - if c.discriminator_model in "random_window_discriminator": - model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params["uncond_disc_donwsample_factors"], - cond_disc_downsample_factors=c.discriminator_model_params["cond_disc_downsample_factors"], - cond_disc_out_channels=c.discriminator_model_params["cond_disc_out_channels"], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - ) - if c.discriminator_model == "parallel_wavegan_discriminator": - model = MyModel( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, - ) - return model - - -# def check_config(c): -# c = None -# pass diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py deleted file mode 100644 index 9c67535f..00000000 --- a/TTS/vocoder/utils/io.py +++ /dev/null @@ -1,128 +0,0 @@ -import datetime -import glob -import os -import pickle as pickle_tts - -import torch - -from TTS.utils.io import RenamingUnpickler - - -def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin - try: - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) - except ModuleNotFoundError: - pickle_tts.Unpickler = RenamingUnpickler - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) - model.load_state_dict(state["model"]) - if use_cuda: - model.cuda() - if eval: - model.eval() - return model, state - - -def save_model( - model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, epoch, output_path, **kwargs -): - if hasattr(model, "module"): - model_state = model.module.state_dict() - else: - model_state = model.state_dict() - model_disc_state = model_disc.state_dict() if model_disc is not None else None - optimizer_state = optimizer.state_dict() if optimizer is not None else None - optimizer_disc_state = optimizer_disc.state_dict() if optimizer_disc is not None else None - scheduler_state = scheduler.state_dict() if scheduler is not None else None - scheduler_disc_state = scheduler_disc.state_dict() if scheduler_disc is not None else None - state = { - "model": model_state, - "optimizer": optimizer_state, - "scheduler": scheduler_state, - "model_disc": model_disc_state, - "optimizer_disc": optimizer_disc_state, - "scheduler_disc": scheduler_disc_state, - "step": 
current_step, - "epoch": epoch, - "date": datetime.date.today().strftime("%B %d, %Y"), - } - state.update(kwargs) - torch.save(state, output_path) - - -def save_checkpoint( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - output_folder, - **kwargs, -): - file_name = "checkpoint_{}.pth.tar".format(current_step) - checkpoint_path = os.path.join(output_folder, file_name) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - **kwargs, - ) - - -def save_best_model( - current_loss, - best_loss, - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - out_path, - keep_all_best=False, - keep_after=10000, - **kwargs, -): - if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" - checkpoint_path = os.path.join(out_path, best_model_name) - print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model( - model, - optimizer, - scheduler, - model_disc, - optimizer_disc, - scheduler_disc, - current_step, - epoch, - checkpoint_path, - model_loss=current_loss, - **kwargs, - ) - # only delete previous if current is saved successfully - if not keep_all_best or (current_step < keep_after): - model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) - for model_name in model_names: - if os.path.basename(model_name) == best_model_name: - continue - os.remove(model_name) - # create symlink to best model for convinience - link_name = "best_model.pth.tar" - link_path = os.path.join(out_path, link_name) - if os.path.islink(link_path) or os.path.isfile(link_path): - os.remove(link_path) - os.symlink(best_model_name, os.path.join(out_path, link_name)) - best_loss = current_loss - return best_loss diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..b1d20a99 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -j auto -WT --keep-going +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..e69de29b diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..73abe83f --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +furo +myst-parser == 0.15.1 +sphinx == 4.0.2 +sphinx_inline_tabs +sphinx_copybutton \ No newline at end of file diff --git a/docs/source/_static/logo.png b/docs/source/_static/logo.png new file mode 100644 index 00000000..6a1185c0 Binary files /dev/null and b/docs/source/_static/logo.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..5831fcdb --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,121 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys + +sys.path.insert(0, os.path.abspath('../..')) + +# mock deps with system level requirements. +autodoc_mock_imports = ["soundfile"] + +# -- Project information ----------------------------------------------------- +project = 'TTS' +copyright = "2021 Coqui GmbH, 2020 TTS authors" +author = 'Coqui GmbH' + +with open("../../TTS/VERSION", "r") as ver: + version = ver.read().strip() + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +release = version + +# The main toctree document. +master_doc = "index" + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'TODO/*'] + +source_suffix = [".rst", ".md"] + +# extensions +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.autosectionlabel', + 'myst_parser', + "sphinx_copybutton", + "sphinx_inline_tabs", +] + +# 'sphinxcontrib.katex', +# 'sphinx.ext.autosectionlabel', + + +# autosectionlabel throws warnings if section names are duplicated. +# The following tells autosectionlabel to not throw a warning for +# duplicated section names that are in different documents. +autosectionlabel_prefix_document = True + +language = None + +autodoc_inherit_docstrings = False + +# Disable displaying type annotations, these can be very verbose +autodoc_typehints = 'none' + +# Enable overriding of function signatures in the first line of the docstring. +autodoc_docstring_signature = True + +napoleon_custom_sections = [('Shapes', 'shape')] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'furo' +html_tite = "TTS" +html_theme_options = { + "light_logo": "logo.png", + "dark_logo": "logo.png", + "sidebar_hide_name": True, +} + +html_sidebars = { + '**': [ + "sidebar/scroll-start.html", + "sidebar/brand.html", + "sidebar/search.html", + "sidebar/navigation.html", + "sidebar/ethical-ads.html", + "sidebar/scroll-end.html", + ] + } + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static']
diff --git a/docs/source/configuration.md b/docs/source/configuration.md
new file mode 100644
index 00000000..cde7e073
--- /dev/null
+++ b/docs/source/configuration.md
@@ -0,0 +1,59 @@
+# Configuration
+
+We use 👩‍✈️[Coqpit] for configuration management. It provides basic static type checking and serialization capabilities on top of native Python `dataclasses`. Here is what a simple configuration looks like with Coqpit.
+
+```python
+from dataclasses import asdict, dataclass, field
+from typing import List, Union
+from coqpit.coqpit import MISSING, Coqpit, check_argument
+
+
+@dataclass
+class SimpleConfig(Coqpit):
+    val_a: int = 10
+    val_b: int = None
+    val_d: float = 10.21
+    val_c: str = "Coqpit is great!"
+    val_e: bool = True
+    # mandatory field
+    # raise an error when accessing the value if it is not changed. It is a way to define mandatory fields.
+    val_k: int = MISSING
+    # optional field
+    val_dict: dict = field(default_factory=lambda: {"val_aa": 10, "val_ss": "This is in a dict."})
+    # list of list
+    val_listoflist: List[List] = field(default_factory=lambda: [[1, 2], [3, 4]])
+    val_listofunion: List[List[Union[str, int, bool]]] = field(
+        default_factory=lambda: [[1, 3], [1, "Hi!"], [True, False]]
+    )
+
+    def check_values(
+        self,
+    ):  # you can define explicit constraints manually or by `check_argument()`
+        """Check config fields"""
+        c = asdict(self)  # avoid unexpected changes on `self`
+        check_argument("val_a", c, restricted=True, min_val=10, max_val=2056)
+        check_argument("val_b", c, restricted=True, min_val=128, max_val=4058, allow_none=True)
+        check_argument("val_c", c, restricted=True)
+```
+
+In TTS, each model must have a configuration class that exposes all the values necessary for its lifetime.
+
+It defines the model architecture, hyper-parameters, and training and inference settings. For our models, we merge all the fields in a single configuration class for ease. It may not look like a wise practice, but it enables easier bookkeeping and reproducible experiments.
+
+The general configuration hierarchy looks like this:
+
+```
+ModelConfig()
+     |
+     | -> ...                   # model specific configurations
+     | -> ModelArgs()           # model class arguments
+     | -> BaseDatasetConfig()   # only for tts models
+     | -> BaseXModelConfig()    # Generic fields for `tts` and `vocoder` models.
+     |
+     | -> BaseTrainingConfig()  # trainer fields
+     | -> BaseAudioConfig()     # audio processing fields
+```
+
+In the example above, ```ModelConfig()``` is the final configuration that the model receives and it has all the fields necessary for the model.
+
+We host pre-defined model configurations under ```TTS//configs/```. Although we recommend a unified config class, you can decompose it as you like for your custom models as long as all the fields for the trainer, model, and inference APIs are provided.
\ No newline at end of file
diff --git a/docs/source/contributing.md b/docs/source/contributing.md
new file mode 100644
index 00000000..5b272509
--- /dev/null
+++ b/docs/source/contributing.md
@@ -0,0 +1,3 @@
+```{include} ../../CONTRIBUTING.md
+:relative-images:
+```
diff --git a/docs/source/converting_torch_to_tf.md b/docs/source/converting_torch_to_tf.md
new file mode 100644
index 00000000..20a0be6b
--- /dev/null
+++ b/docs/source/converting_torch_to_tf.md
@@ -0,0 +1,21 @@
+# Converting Torch to TF 2
+
+Currently, 🐸TTS supports the vanilla Tacotron2 and MelGAN models in TF 2. It does not support advanced attention methods and other small tricks used by the Torch models. You can convert any Torch model trained after v0.0.2.
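To make the Coqpit-based configuration workflow described in `configuration.md` above concrete, here is a minimal illustrative sketch. It reuses the `HifiganConfig` class that appears in the doctest examples earlier in this changeset; the overridden field names (`batch_size`, `lr_gen`) are taken from the GAN model code above, and the `check_values()` call is guarded because not every config is assumed to define it.

```python
# Minimal sketch of the config workflow described in configuration.md (illustrative only).
from dataclasses import asdict

from TTS.vocoder.configs import HifiganConfig  # pre-defined config, as used in the GAN doctest above

config = HifiganConfig()   # every field starts from its default value
config.batch_size = 32     # override only the fields you care about, as a config.json would
config.lr_gen = 1e-4       # generator learning rate, read by `GAN.get_optimizer()`

# run explicit constraints if the config defines them (see `check_values()` in the example above)
if hasattr(config, "check_values"):
    config.check_values()

print(asdict(config)["batch_size"])  # Coqpit configs behave like plain dataclasses
```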
You can convert any Torch model trained after v0.0.2. + +You can also export TF 2 models to TFLite for even faster inference. + +## How to convert from Torch to TF 2.0 +Make sure you have TensorFlow v2.2 installed. It is not installed by default with 🐸TTS. + +All the TF-related code stays under the ```tf``` folder. + +To convert a **compatible** Torch model, run the following command with the right arguments: + +```bash +python TTS/bin/convert_tacotron2_torch_to_tf.py\ + --torch_model_path /path/to/torch/model.pth.tar \ + --config_path /path/to/model/config.json\ + --output_path /path/to/output/tf/model +``` + +This will create a TF model file. Note that our model format is not compatible with the official TF checkpoints. We created a custom format to match the Torch checkpoints we use. Therefore, use the ```load_checkpoint``` and ```save_checkpoint``` functions provided under ```TTS.tf.generic_utils```. diff --git a/docs/source/faq.md b/docs/source/faq.md new file mode 100644 index 00000000..4dbaab13 --- /dev/null +++ b/docs/source/faq.md @@ -0,0 +1,114 @@ +# Humble FAQ +We tried to collect common issues and questions we receive about 🐸TTS. It is worth checking them before digging deeper. + +## Errors with a pre-trained model. How can I resolve this? +- Make sure you use the right commit version of 🐸TTS. Each pre-trained model has a corresponding version that needs to be used. It is defined in the model table. +- If it is still problematic, post your problem on [Discussions](https://github.com/coqui-ai/TTS/discussions). Please give as many details as possible (error message, your TTS version, your TTS model and config.json, etc.) +- If you think it is a bug to be fixed, open a GitHub issue with the same level of detail. + +## What are the requirements of a good 🐸TTS dataset? +* https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset + +## How should I choose the right model? +- First, train Tacotron. It is smaller and faster to experiment with. If it performs poorly, try Tacotron2. +- Tacotron models produce the most natural voice if your dataset is not too noisy. +- If neither model performs well, and especially if the attention does not align, then try AlignTTS or GlowTTS. +- If you need faster models, consider SpeedySpeech, GlowTTS or AlignTTS. Keep in mind that SpeedySpeech requires a pre-trained Tacotron or Tacotron2 model to compute text-to-speech alignments. + +## How can I train my own `tts` model? +0. Check your dataset with the notebooks in the [dataset_analysis](https://github.com/coqui-ai/TTS/tree/master/notebooks/dataset_analysis) folder. Use [this notebook](https://github.com/coqui-ai/TTS/blob/master/notebooks/dataset_analysis/CheckSpectrograms.ipynb) to find the right audio processing parameters. A better set of parameters results in better audio synthesis. + +1. Write your own dataset `formatter` in `datasets/formatters.py` or format your dataset as one of the supported datasets, like LJSpeech. + A `formatter` parses the metadata file and converts it into a list of training samples. + +2. If you have a dataset with a different alphabet than English, you need to set your own character list in the ```config.json```. + - If you use phonemes for training and your language is supported [here](https://github.com/rhasspy/gruut#supported-languages), you don't need to set your character list. + - You can use `TTS/bin/find_unique_chars.py` to get the characters used in your dataset. + +3. Write your own text cleaner in ```utils.text.cleaners```.
It is not always necessary, except when you have a different alphabet or language-specific requirements. + - A `cleaner` performs number and abbreviation expansion and text normalization. Basically, it converts the written text to its spoken format. + - If you go lazy, you can try using ```basic_cleaners```. + +4. Fill in a ```config.json```. Go over each parameter one by one and consider it regarding the appended explanation. + - Check the `Coqpit` class created for your target model. Coqpit classes for `tts` models are under `TTS/tts/configs/`. + - You just need to define fields you need/want to change in your `config.json`. For the rest, their default values are used. + - 'sample_rate', 'phoneme_language' (if phoneme enabled), 'output_path', 'datasets', 'text_cleaner' are the fields you need to edit in most of the cases. + - Here is a sample `config.json` for training a `GlowTTS` network. + ```json + { + "model": "glow_tts", + "batch_size": 32, + "eval_batch_size": 16, + "num_loader_workers": 4, + "num_eval_loader_workers": 4, + "run_eval": true, + "test_delay_epochs": -1, + "epochs": 1000, + "text_cleaner": "english_cleaners", + "use_phonemes": false, + "phoneme_language": "en-us", + "phoneme_cache_path": "phoneme_cache", + "print_step": 25, + "print_eval": true, + "mixed_precision": false, + "output_path": "recipes/ljspeech/glow_tts/", + "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."], + "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] + } + ``` + +6. Train your model. + - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` + - MultiGPU training: ```CUDA_VISIBLE_DEVICES="0,1,2" python distribute.py --script train_tts.py --config_path config.json``` + - This command uses all the GPUs given in ```CUDA_VISIBLE_DEVICES```. If you don't specify, it uses all the GPUs available. + +**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. + +## How can I train in a different language? +- Check steps 2, 3, 4, 5 above. + +## How can I train multi-GPUs? +- Check step 5 above. + +## How can I check model performance? +- You can inspect model training and performance using ```tensorboard```. It will show you loss, attention alignment, model output. Go with the order below to measure the model performance. +1. Check ground truth spectrograms. If they do not look as they are supposed to, then check audio processing parameters in ```config.json```. +2. Check train and eval losses and make sure that they all decrease smoothly in time. +3. Check model spectrograms. Especially, training outputs should look similar to ground truth spectrograms after ~10K iterations. +4. Your model would not work well at test time until the attention has a near diagonal alignment. This is the sublime art of TTS training. + - Attention should converge diagonally after ~50K iterations. + - If attention does not converge, the probabilities are; + - Your dataset is too noisy or small. + - Samples are too long. + - Batch size is too small (batch_size < 32 would be having a hard time converging) + - You can also try other attention algorithms like 'graves', 'bidirectional_decoder', 'forward_attn'. + - 'bidirectional_decoder' is your ultimate savior, but it trains 2x slower and demands 1.5x more GPU memory. + - You can also try the other models like AlignTTS or GlowTTS. + +## How do I know when to stop training? 
+There is no single objective metric to decide the end of a training since the voice quality is a subjective matter. + +In our model trainings, we follow these steps; + +- Check test time audio outputs, if it does not improve more. +- Check test time attention maps, if they look clear and diagonal. +- Check validation loss, if it converged and smoothly went down or started to overfit going up. +- If the answer is YES for all of the above, then test the model with a set of complex sentences. For English, you can use the `TestAttention` notebook. + +Keep in mind that the approach above only validates the model robustness. It is hard to estimate the voice quality without asking the actual people. +The best approach is to pick a set of promising models and run a Mean-Opinion-Score study asking actual people to score the models. + +## My model does not learn. How can I debug? +- Go over the steps under "How can I check model performance?" + +## Attention does not align. How can I make it work? +- Check the 4th step under "How can I check model performance?" + +## How can I test a trained model? +- The best way is to use `tts` or `tts-server` commands. For details check {ref}`here `. +- If you need to code your own ```TTS.utils.synthesizer.Synthesizer``` class. + +## My Tacotron model does not stop - I see "Decoder stopped with 'max_decoder_steps" - Stopnet does not work. +- In general, all of the above relates to the `stopnet`. It is the part of the model telling the `decoder` when to stop. +- In general, a poor `stopnet` relates to something else that is broken in your model or dataset. Especially the attention module. +- One common reason is the silent parts in the audio clips at the beginning and the ending. Check ```trim_db``` value in the config. You can find a better value for your dataset by using ```CheckSpectrogram``` notebook. If this value is too small, too much of the audio will be trimmed. If too big, then too much silence will remain. Both will curtail the `stopnet` performance. \ No newline at end of file diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md new file mode 100644 index 00000000..cc0e456a --- /dev/null +++ b/docs/source/formatting_your_dataset.md @@ -0,0 +1,82 @@ +# Formatting Your Dataset + +For training a TTS model, you need a dataset with speech recordings and transcriptions. The speech must be divided into audio clips and each clip needs transcription. + +If you have a single audio file and you need to split it into clips, there are different open-source tools for you. We recommend Audacity. It is an open-source and free audio editing software. + +It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using `wav` file format. + +Let's assume you created the audio clips and their transcription. You can collect all your clips under a folder. Let's call this folder `wavs`. + +``` +/wavs + | - audio1.wav + | - audio2.wav + | - audio3.wav + ... +``` + +You can either create separate transcription files for each clip or create a text file that maps each audio clip to its transcription. In this file, each line must be delimitered by a special character separating the audio file name from the transcription. And make sure that the delimiter is not used in the transcription text. + +We recommend the following format delimited by `|`. + +``` +# metadata.txt + +audio1.wav | This is my sentence. +audio2.wav | This is maybe my sentence. +audio3.wav | This is certainly my sentence. 
+audio4.wav | Let this be your sentence. +... +``` + +In the end, we have the following folder structure: +``` +/MyTTSDataset + | + | -> metadata.txt + | -> /wavs + | -> audio1.wav + | -> audio2.wav + | ... +``` + +The format above is taken from the widely-used [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) dataset. You can also download and inspect the dataset. 🐸TTS already provides tooling for LJSpeech, so if you use the same format, you can start training your models right away. + +## Dataset Quality + +Your dataset should have good coverage of the target language. It should cover the phonemic variety, exceptional sounds, and syllables. This is especially important for non-phonemic languages like English. + +For more info about dataset qualities and properties, check our [post](https://github.com/coqui-ai/TTS/wiki/What-makes-a-good-TTS-dataset). + +## Using Your Dataset in 🐸TTS + +After you collect and format your dataset, you need to check two things: whether you need a `formatter` and whether you need a `text_cleaner`. The `formatter` loads the text file (created above) as a list and the `text_cleaner` performs a sequence of text normalization operations that converts the raw text into the spoken representation (e.g. converting numbers to text, acronyms, and symbols to the spoken format). + +If you use a dataset format other than LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own `formatter`. + +If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`. + +What you get out of a `formatter` is a `List[List[]]` in the following format. + +``` +>>> formatter(metafile_path) +[["audio1.wav", "This is my sentence.", "MyDataset"], +["audio1.wav", "This is maybe a sentence.", "MyDataset"], +... +] +``` + +Each sub-list is parsed as ```["<audio_path>", "<text>", "<speaker_name>"]```. +```<speaker_name>``` is the dataset name for single speaker datasets, and it is mainly used +in the multi-speaker models to map the speaker of each sample. But for now, we only focus on single speaker datasets. + +The purpose of a `formatter` is to parse your metafile and load the audio file paths and transcriptions. Then, its output is passed to a `Dataset` object. It computes features from the audio signals, calls text normalization routines, and converts raw text to +phonemes if needed. + +See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `tts` models. + +See `TTS.vocoder.datasets.*` for the different `Dataset` implementations for the `vocoder` models. + +See `TTS.utils.audio.AudioProcessor`, which includes all the audio processing and feature extraction functions used in a +`Dataset` implementation. Feel free to add things as you need. \ No newline at end of file diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md new file mode 100644 index 00000000..c0043bf1 --- /dev/null +++ b/docs/source/implementing_a_new_model.md @@ -0,0 +1,61 @@ +# Implementing a Model + +1. Implement layers. + + You can either implement the layers under `TTS/tts/layers/new_model.py` or in the model file `TTS/tts/model/new_model.py`. + You can also reuse layers already implemented. + +2. Test layers. + + We keep tests under the `tests` folder. You can add `tts` layer tests under the `tts_tests` folder. + Basic tests check input-output tensor shapes and output values for a given input. Consider testing extreme cases that are more likely to cause problems, like `zero` tensors. + +3. Implement loss function.
+ + We keep loss functions under `TTS/tts/layers/losses.py`. You can also mix-and-match implemented loss functions as you like. + + A loss function returns a dictionary in a format ```{’loss’: loss, ‘loss1’:loss1 ...}``` and the dictionary must at least define the `loss` key which is the actual value used by the optimizer. All the items in the dictionary are automatically logged on the terminal and the Tensorboard. + +4. Test the loss function. + + As we do for the layers, you need to test the loss functions too. You need to check input/output tensor shapes, + expected output values for a given input tensor. For instance, certain loss functions have upper and lower limits and + it is a wise practice to test with the inputs that should produce these limits. + +5. Implement `MyModel`. + + In 🐸TTS, a model class is a self-sufficient implementation of a model directing all the interactions with the other + components. It is enough to implement the API provided by the `BaseModel` class to comply. + + A model interacts with the `Trainer API` for training, `Synthesizer API` for inference and testing. + + A 🐸TTS model must return a dictionary by the `forward()` and `inference()` functions. This dictionary must also include the `model_outputs` key that is considered as the main model output by the `Trainer` and `Synthesizer`. + + You can place your `tts` model implementation under `TTS/tts/models/new_model.py` then inherit and implement the `BaseTTS`. + + There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you + the infinite flexibility to add custom behaviours for your model and training routines. + + For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. + +6. Optionally, define `MyModelArgs`. + + `MyModelArgs` is a 👨‍✈️Coqpit class that sets all the class arguments of the `MyModel`. It should be enough to pass + an `MyModelArgs` instance to initiate the `MyModel`. + +7. Test `MyModel`. + + As the layers and the loss functions, it is recommended to test your model. One smart way for testing is that you + create two models with the exact same weights. Then we run a training loop with one of these models and + compare the weights with the other model. All the weights need to be different in a passing test. Otherwise, it + is likely that a part of the model is malfunctioning or not even attached to the model's computational graph. + +8. Define `MyModelConfig`. + + Place `MyModelConfig` file under `TTS/models/configs`. It is enough to inherit the `BaseTTSConfig` to make your + config compatible with the `Trainer`. You should also include `MyModelArgs` as a field if defined. The rest of the fields should define the model + specific values and parameters. + +9. Write Docstrings. + + We love you more when you document your code. ❤️ diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 00000000..001265fa --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,53 @@ + +```{include} ../../README.md +:relative-images: +``` +---- + +# Documentation Content +```{eval-rst} +.. toctree:: + :maxdepth: 2 + :caption: Get started + + tutorial_for_nervous_beginners + installation + faq + contributing + +.. toctree:: + :maxdepth: 2 + :caption: Using 🐸TTS + + inference + implementing_a_new_model + training_a_model + configuration + formatting_your_dataset + what_makes_a_good_dataset + tts_datasets + converting_torch_to_tf + +.. 
toctree:: + :maxdepth: 2 + :caption: Main Classes + + main_classes/trainer_api + main_classes/audio_processor + main_classes/model_api + main_classes/dataset + main_classes/gan + main_classes/speaker_manager + +.. toctree:: + :maxdepth: 2 + :caption: `tts` Models + + models/glow_tts.md + +.. toctree:: + :maxdepth: 2 + :caption: `vocoder` Models + +``` + diff --git a/docs/source/inference.md b/docs/source/inference.md new file mode 100644 index 00000000..544473bf --- /dev/null +++ b/docs/source/inference.md @@ -0,0 +1,103 @@ +(synthesizing_speech)= +# Synthesizing Speech + +First, you need to install TTS. We recommend using PyPi. You need to call the command below: + +```bash +$ pip install TTS +``` + +After the installation, 2 terminal commands are available. + +1. TTS Command Line Interface (CLI). - `tts` +2. Local Demo Server. - `tts-server` + +## On the Commandline - `tts` +![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) + +After the installation, 🐸TTS provides a CLI interface for synthesizing speech using pre-trained models. You can either use your own model or the release models under 🐸TTS. + +Listing released 🐸TTS models. + +```bash +tts --list_models +``` + +Run a TTS model, from the release models list, with its default vocoder. (Simply copy and paste the full model names from the list as arguments for the command below.) + +```bash +tts --text "Text for TTS" \ + --model_name "///" \ + --out_path folder/to/save/output.wav +``` + +Run a tts and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. + +```bash +tts --text "Text for TTS" \ + --model_name "///" \ + --vocoder_name "///" \ + --out_path folder/to/save/output.wav +``` + +Run your own TTS model (Using Griffin-Lim Vocoder) + +```bash +tts --text "Text for TTS" \ + --model_path path/to/model.pth.tar \ + --config_path path/to/config.json \ + --out_path folder/to/save/output.wav +``` + +Run your own TTS and Vocoder models + +```bash +tts --text "Text for TTS" \ + --config_path path/to/config.json \ + --model_path path/to/model.pth.tar \ + --out_path folder/to/save/output.wav \ + --vocoder_path path/to/vocoder.pth.tar \ + --vocoder_config_path path/to/vocoder_config.json +``` + +Run a multi-speaker TTS model from the released models list. + +```bash +tts --model_name "///" --list_speaker_idxs # list the possible speaker IDs. +tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx "" +``` + +**Note:** You can use ```./TTS/bin/synthesize.py``` if you prefer running ```tts``` from the TTS project folder. + +## On the Demo Server - `tts-server` + + +![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + +You can boot up a demo 🐸TTS server to run an inference with your models. Note that the server is not optimized for performance +but gives you an easy way to interact with the models. + +The demo server provides pretty much the same interface as the CLI command. + +```bash +tts-server -h # see the help +tts-server --list_models # list the available models. +``` + +Run a TTS model, from the release models list, with its default vocoder. +If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize +speech. + +```bash +tts-server --model_name "///" +``` + +Run a TTS and a vocoder model from the released model list. Note that not every vocoder is compatible with every TTS model. 
+ +```bash +tts-server --model_name "///" \ + --vocoder_name "///" +``` + +## TorchHub +You can also use [this simple colab notebook](https://colab.research.google.com/drive/1iAe7ZdxjUIuN6V4ooaCt0fACEGKEn7HW?usp=sharing) using TorchHub to synthesize speech. \ No newline at end of file diff --git a/docs/source/installation.md b/docs/source/installation.md new file mode 100644 index 00000000..6532ee8e --- /dev/null +++ b/docs/source/installation.md @@ -0,0 +1,39 @@ +# Installation + +🐸TTS supports python >=3.6 <=3.9 and tested on Ubuntu 18.10, 19.10, 20.10. + +## Using `pip` + +`pip` is recommended if you want to use 🐸TTS only for inference. + +You can install from PyPI as follows: + +```bash +pip install TTS # from PyPI +``` + +By default, this only installs the requirements for PyTorch. To install the tensorflow dependencies as well, use the `tf` extra. + +```bash +pip install TTS[tf] +``` + +Or install from Github: + +```bash +pip install git+https://github.com/coqui-ai/TTS # from Github +``` + +## Installing From Source + +This is recommended for development and more control over 🐸TTS. + +```bash +git clone https://github.com/coqui-ai/TTS/ +cd TTS +make system-deps # only on Linux systems. +make install +``` + +## On Windows +If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/ \ No newline at end of file diff --git a/docs/source/main_classes/audio_processor.md b/docs/source/main_classes/audio_processor.md new file mode 100644 index 00000000..600b0db5 --- /dev/null +++ b/docs/source/main_classes/audio_processor.md @@ -0,0 +1,25 @@ +# AudioProcessor API + +`TTS.utils.audio.AudioProcessor` is the core class for all the audio processing routines. It provides an API for + +- Feature extraction. +- Sound normalization. +- Reading and writing audio files. +- Sampling audio signals. +- Normalizing and denormalizing audio signals. +- Griffin-Lim vocoder. + +The `AudioProcessor` needs to be initialized with `TTS.config.shared_configs.BaseAudioConfig`. Any model config +also must inherit or initiate `BaseAudioConfig`. + +## AudioProcessor +```{eval-rst} +.. autoclass:: TTS.utils.audio.AudioProcessor + :members: +``` + +## BaseAudioConfig +```{eval-rst} +.. autoclass:: TTS.config.shared_configs.BaseAudioConfig + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/dataset.md b/docs/source/main_classes/dataset.md new file mode 100644 index 00000000..92d381ac --- /dev/null +++ b/docs/source/main_classes/dataset.md @@ -0,0 +1,25 @@ +# Datasets + +## TTS Dataset + +```{eval-rst} +.. autoclass:: TTS.tts.datasets.TTSDataset + :members: +``` + +## Vocoder Dataset + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.gan_dataset.GANDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavegrad_dataset.WaveGradDataset + :members: +``` + +```{eval-rst} +.. autoclass:: TTS.vocoder.datasets.wavernn_dataset.WaveRNNDataset + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/gan.md b/docs/source/main_classes/gan.md new file mode 100644 index 00000000..4524b4b5 --- /dev/null +++ b/docs/source/main_classes/gan.md @@ -0,0 +1,12 @@ +# GAN API + +The {class}`TTS.vocoder.models.gan.GAN` provides an easy way to implementing new GAN based models. You just need +to define the model architectures for the generator and the discriminator networks and give them to the `GAN` class +to do its ✨️. + + +## GAN +```{eval-rst} +.. 
autoclass:: TTS.vocoder.models.gan.GAN + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md new file mode 100644 index 00000000..6781a268 --- /dev/null +++ b/docs/source/main_classes/model_api.md @@ -0,0 +1,24 @@ +# Model API +Model API provides you a set of functions that easily make your model compatible with the `Trainer`, +`Synthesizer` and `ModelZoo`. + +## Base TTS Model + +```{eval-rst} +.. autoclass:: TTS.model.BaseModel + :members: +``` + +## Base `tts` Model + +```{eval-rst} +.. autoclass:: TTS.tts.models.base_tts.BaseTTS + :members: +``` + +## Base `vocoder` Model + +```{eval-rst} +.. autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/speaker_manager.md b/docs/source/main_classes/speaker_manager.md new file mode 100644 index 00000000..ba4b55dc --- /dev/null +++ b/docs/source/main_classes/speaker_manager.md @@ -0,0 +1,11 @@ +# Speaker Manager API + +The {class}`TTS.tts.utils.speakers.SpeakerManager` organize speaker related data and information for 🐸TTS models. It is +especially useful for multi-speaker models. + + +## Speaker Manager +```{eval-rst} +.. automodule:: TTS.tts.utils.speakers + :members: +``` \ No newline at end of file diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md new file mode 100644 index 00000000..a5c3cfb7 --- /dev/null +++ b/docs/source/main_classes/trainer_api.md @@ -0,0 +1,17 @@ +# Trainer API + +The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but +can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. + + +## Trainer +```{eval-rst} +.. autoclass:: TTS.trainer.Trainer + :members: +``` + +## TrainingArgs +```{eval-rst} +.. autoclass:: TTS.trainer.TrainingArgs + :members: +``` \ No newline at end of file diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 00000000..922152e9 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md new file mode 100644 index 00000000..a7e81f28 --- /dev/null +++ b/docs/source/training_a_model.md @@ -0,0 +1,165 @@ +# Training a Model + +1. Decide what model you want to use. + + Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model servers your needs. 
Other than referring to the papers, one easy way is to test the 🐸TTS + community models and see how fast and good each of the models. Or you can start a discussion on our communication channels. + +2. Understand the configuration class, its fields and values of your model. + + For instance, if you want to train a `Tacotron` model then see the `TacotronConfig` class and make sure you understand it. + +3. Go to the recipes and check the recipe of your target model. + + Recipes do not promise perfect models but they provide a good start point for `Nervous Beginners`. A recipe script training + a `GlowTTS` model on `LJSpeech` dataset looks like below. Let's be creative and call this script `train_glowtts.py`. + + ```python + # train_glowtts.py + + import os + + from TTS.tts.configs import GlowTTSConfig + from TTS.tts.configs import BaseDatasetConfig + from TTS.trainer import init_training, Trainer, TrainingArgs + + + output_path = os.path.dirname(os.path.abspath(__file__)) + dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) + config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] + ) + args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) + trainer = Trainer(args, config, output_path, c_logger, tb_logger) + trainer.fit() + ``` + + You need to change fields of the `BaseDatasetConfig` to match your own dataset and then update `GlowTTSConfig` + fields as you need. + + 4. Run the training. + + You need to call the python training script. + + ```bash + $ CUDA_VISIBLE_DEVICES="0" python train_glowtts.py + ``` + + Notice that you set the GPU you want to use on your system by setting `CUDA_VISIBLE_DEVICES` environment variable. + To see available GPUs on your system, you can use `nvidia-smi` command on the terminal. + + If you like to run a multi-gpu training + + ```bash + $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py + ``` + + The example above runs a multi-gpu training using GPUs `0, 1, 2`. + + The beginning of a training run looks like below. + + ```console + > Experiment folder: /your/output_path/-Juni-23-2021_02+52-78899209 + > Using CUDA: True + > Number of GPUs: 1 + > Setting up Audio Processor... 
+ | > sample_rate:22050 + | > resample:False + | > num_mels:80 + | > min_level_db:-100 + | > frame_shift_ms:None + | > frame_length_ms:None + | > ref_level_db:20 + | > fft_size:1024 + | > power:1.5 + | > preemphasis:0.0 + | > griffin_lim_iters:60 + | > signal_norm:True + | > symmetric_norm:True + | > mel_fmin:0 + | > mel_fmax:None + | > spec_gain:20.0 + | > stft_pad_mode:reflect + | > max_norm:4.0 + | > clip_norm:True + | > do_trim_silence:True + | > trim_db:45 + | > do_sound_norm:False + | > stats_path:None + | > base:10 + | > hop_length:256 + | > win_length:1024 + | > Found 13100 files in /your/dataset/path/ljspeech/LJSpeech-1.1 + > Using model: glow_tts + + > Model has 28356129 parameters + + > EPOCH: 0/1000 + + > DataLoader initialization + | > Use phonemes: False + | > Number of instances : 12969 + | > Max length sequence: 187 + | > Min length sequence: 5 + | > Avg length sequence: 98.3403500655409 + | > Num. instances discarded by max-min (max=500, min=3) seq limits: 0 + | > Batch group size: 0. + + > TRAINING (2021-06-23 14:52:54) + + --> STEP: 0/405 -- GLOBAL_STEP: 0 + | > loss: 2.34670 + | > log_mle: 1.61872 + | > loss_dur: 0.72798 + | > align_error: 0.52744 + | > current_lr: 2.5e-07 + | > grad_norm: 5.036039352416992 + | > step_time: 5.8815 + | > loader_time: 0.0065 + ... + ``` + +5. Run the Tensorboard. + + ```bash + $ tensorboard --logdir= + ``` + +6. Check the logs and the Tensorboard and monitor the training. + + On the terminal and Tensorboard, you can monitor the losses and their changes over time. Also Tensorboard provides certain figures and sample outputs. + + Note that different models have different metrics, visuals and outputs to be displayed. + + You should also check the [FAQ page](https://github.com/coqui-ai/TTS/wiki/FAQ) for common problems and solutions + that occur in a training. + +7. Use your best model for inference. + + Use `tts` or `tts-server` commands for testing your models. + + ```bash + $ tts --text "Text for TTS" \ + --model_path path/to/checkpoint_x.pth.tar \ + --config_path path/to/config.json \ + --out_path folder/to/save/output.wav + ``` + +8. Return to the step 1 and reiterate for training a `vocoder` model. + + In the example above, we trained a `GlowTTS` model, but the same workflow applies to all the other 🐸TTS models. diff --git a/docs/source/tts_datasets.md b/docs/source/tts_datasets.md new file mode 100644 index 00000000..6075bc95 --- /dev/null +++ b/docs/source/tts_datasets.md @@ -0,0 +1,16 @@ +# TTS Datasets + +Some of the known public datasets that we successfully applied 🐸TTS: + +- [English - LJ Speech](https://keithito.com/LJ-Speech-Dataset/) +- [English - Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/) +- [English - TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset) +- [English - LibriTTS](https://openslr.org/60/) +- [English - VCTK](https://datashare.ed.ac.uk/handle/10283/2950) +- [Multilingual - M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/) +- [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01 +- [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts) +- [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1) +- [Chinese](https://www.data-baker.com/open_source.html) + +Let us know if you use 🐸TTS on a different dataset. 
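If you bring a new dataset in the `metadata.txt` format described in `formatting_your_dataset.md`, the matching `formatter` can be as small as the sketch below. It follows the `[[audio_path, text, speaker_name], ...]` output structure shown in that document; the function name and the `root_path`/`meta_file` arguments are illustrative assumptions, so match the signatures of the existing formatters in `TTS/tts/datasets/formatters.py` when you add your own.

```python
import os

def my_dataset(root_path, meta_file, **kwargs):  # hypothetical name and signature
    """Parse lines like `audio1.wav | This is my sentence.` into the
    [[audio_path, text, speaker_name], ...] structure described in formatting_your_dataset.md."""
    items = []
    speaker_name = "my_dataset"  # single-speaker dataset: reuse the dataset name
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "|" not in line:
                continue  # skip empty lines, comments, and malformed rows
            wav_name, text = [col.strip() for col in line.split("|", 1)]
            items.append([os.path.join(root_path, "wavs", wav_name), text, speaker_name])
    return items
```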
\ No newline at end of file diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md new file mode 100644 index 00000000..015e178d --- /dev/null +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -0,0 +1,175 @@ +# Tutorial For Nervous Beginners + +## Installation + +User friendly installation. Recommended only for synthesizing voice. + +```bash +$ pip install TTS +``` + +Developer friendly installation. + +```bash +$ git clone https://github.com/coqui-ai/TTS +$ cd TTS +$ pip install -e . +``` + +## Training a `tts` Model + +A breakdown of a simple script training a GlowTTS model on LJspeech dataset. See the comments for the explanation of +each line. + +### Pure Python Way + +```python +import os + +# GlowTTSConfig: all model related values for training, validating and testing. +from TTS.tts.configs import GlowTTSConfig + +# BaseDatasetConfig: defines name, formatter and path of the dataset. +from TTS.tts.configs import BaseDatasetConfig + +# init_training: Initialize and setup the training environment. +# Trainer: Where the ✨️ happens. +# TrainingArgs: Defines the set of arguments of the Trainer. +from TTS.trainer import init_training, Trainer, TrainingArgs + +# we use the same path as this script as our training folder. +output_path = os.path.dirname(os.path.abspath(__file__)) + +# set LJSpeech as our target dataset and define its path so that the Trainer knows what data formatter it needs. +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) + +# Configure the model. Every config class inherits the BaseTTSConfig to have all the fields defined for the Trainer. +config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) + +# Take the config and the default Trainer arguments, setup the training environment and override the existing +# config values from the terminal. So you can do the following. +# >>> python train.py --coqpit.batch_size 128 +args, config, output_path, _, _, _= init_training(TrainingArgs(), config) + +# Initiate the Trainer. +# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, +# distributed training etc. +trainer = Trainer(args, config, output_path) + +# And kick it 🚀 +trainer.fit() +``` + +### CLI Way + +We still support running training from CLI like in the old days. The same training can be started as follows. + +1. Define your `config.json` + + ```json + { + "model": "glow_tts", + "batch_size": 32, + "eval_batch_size": 16, + "num_loader_workers": 4, + "num_eval_loader_workers": 4, + "run_eval": true, + "test_delay_epochs": -1, + "epochs": 1000, + "text_cleaner": "english_cleaners", + "use_phonemes": false, + "phoneme_language": "en-us", + "phoneme_cache_path": "phoneme_cache", + "print_step": 25, + "print_eval": true, + "mixed_precision": false, + "output_path": "recipes/ljspeech/glow_tts/", + "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] + } + ``` + +2. Start training. 
+ ```bash + $ CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path config.json + ``` + + + +## Training a `vocoder` Model + +```python +import os + +from TTS.vocoder.configs import HifiganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = HifiganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + # `vocoder` only needs a data path and they read recursively all the `.wav` files underneath. + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() +``` + +❗️ Note that you can also start the training run from CLI as the `tts` model above. + +## Synthesizing Speech + +You can run `tts` and synthesize speech directly on the terminal. + +```bash +$ tts -h # see the help +$ tts --list_models # list the available models. +``` + +![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) + + +You can call `tts-server` to start a local demo server that you can open it on +your favorite web browser and 🗣️. + +```bash +$ tts-server -h # see the help +$ tts-server --list_models # list the available models. +``` +![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) + + + diff --git a/docs/source/what_makes_a_good_dataset.md b/docs/source/what_makes_a_good_dataset.md new file mode 100644 index 00000000..49a2943b --- /dev/null +++ b/docs/source/what_makes_a_good_dataset.md @@ -0,0 +1,19 @@ +# What makes a good TTS dataset + +## What Makes a Good Dataset +* **Gaussian like distribution on clip and text lengths**. So plot the distribution of clip lengths and check if it covers enough short and long voice clips. +* **Mistake free**. Remove any wrong or broken files. Check annotations, compare transcript and audio length. +* **Noise free**. Background noise might lead your model to struggle, especially for a good alignment. Even if it learns the alignment, the final result is likely to be suboptimial. +* **Compatible tone and pitch among voice clips**. For instance, if you are using audiobook recordings for your project, it might have impersonations for different characters in the book. These differences between samples downgrade the model performance. +* **Good phoneme coverage**. Make sure that your dataset covers a good portion of the phonemes, di-phonemes, and in some languages tri-phonemes. +* **Naturalness of recordings**. For your model WISIAIL (What it sees is all it learns). Therefore, your dataset should accommodate all the attributes you want to hear from your model. + +## Preprocessing Dataset +If you like to use a bespoken dataset, you might like to perform a couple of quality checks before training. 🐸TTS provides a couple of notebooks (CheckSpectrograms, AnalyzeDataset) to expedite this part for you. + +* **AnalyzeDataset** is for checking dataset distribution in terms of the clip and transcript lengths. It is good to find outlier instances (too long, short text but long voice clip, etc.)and remove them before training. 
Keep in mind that we like to have a good balance between long and short clips to prevent any bias in training. If you have only short clips (1-3 secs), then your model might suffer for long sentences and if your instances are long, then it might not learn the alignment or might take too long to train the model. + +* **CheckSpectrograms** is to measure the noise level of the clips and find good audio processing parameters. The noise level might be observed by checking spectrograms. If spectrograms look cluttered, especially in silent parts, this dataset might not be a good candidate for a TTS project. If your voice clips are too noisy in the background, it makes things harder for your model to learn the alignment, and the final result might be different than the voice you are given. +If the spectrograms look good, then the next step is to find a good set of audio processing parameters, defined in ```config.json```. In the notebook, you can compare different sets of parameters and see the resynthesis results in relation to the given ground-truth. Find the best parameters that give the best possible synthesis performance. + +Another practical detail is the quantization level of the clips. If your dataset has a very high bit-rate, that might cause slow data-load time and consequently slow training. It is better to reduce the sample-rate of your dataset to around 16000-22050. \ No newline at end of file diff --git a/hubconf.py b/hubconf.py index bcbd6fce..96f12b5f 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,5 +1,5 @@ dependencies = [ - 'torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin' + 'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite` ] import torch diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index dc35e86f..4e42a3bb 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -22,7 +22,7 @@ "import numpy as np\n", "from tqdm import tqdm as tqdm\n", "from torch.utils.data import DataLoader\n", - "from TTS.tts.datasets.TTSDataset import MyDataset\n", + "from TTS.tts.datasets.TTSDataset import TTSDataset\n", "from TTS.tts.layers.losses import L1LossMasked\n", "from TTS.utils.audio import AudioProcessor\n", "from TTS.utils.io import load_config\n", @@ -112,7 +112,7 @@ "preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = TTSDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,characters=c.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index 97f9800d..ec20383f 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -13,11 +13,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 2, -======= "execution_count": null, ->>>>>>> dev "metadata": {}, "outputs": [], "source": [ @@ -29,13 +25,10 
@@ "import umap\n", "\n", "from TTS.speaker_encoder.model import SpeakerEncoder\n", -<<<<<<< HEAD - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config\n", -======= - "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import load_config\n", ->>>>>>> dev "\n", "from bokeh.io import output_notebook, show\n", "from bokeh.plotting import figure\n", @@ -57,331 +50,9 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - " \n", - " Loading BokehJS ...\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "\n", - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " var force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - " var JS_MIME_TYPE = 'application/javascript';\n", - " var HTML_MIME_TYPE = 'text/html';\n", - " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " var CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " var script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " var cell = handle.cell;\n", - "\n", - " var id = cell.output_area._bokeh_element_id;\n", - " var server_id = cell.output_area._bokeh_server_id;\n", - " // Clean up Bokeh references\n", - " if (id != null && id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " var id = msg.content.text.trim();\n", - " if (id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " var output_area = handle.output_area;\n", - " var output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " var bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " var script_attrs = bk_div.children[0].attributes;\n", - " for (var i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " 
function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " var toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " var events = require('base/js/events');\n", - " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - "\n", - " \n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " var NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"
    \\n\"+\n", - " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", - " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", - " \"
\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " var el = document.getElementById(\"1001\");\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error() {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (var i = 0; i < css_urls.length; i++) {\n", - " var url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " for (var i = 0; i < js_urls.length; i++) {\n", - " var url = js_urls[i];\n", - " var element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.async = false;\n", - " element.src = url;\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };var element = document.getElementById(\"1001\");\n", - " if (element == null) {\n", - " console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. 
\")\n", - " return false;\n", - " }\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " \n", - " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n", - " var css_urls = [];\n", - " \n", - "\n", - " var inline_js = [\n", - " function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - " function(Bokeh) {\n", - " \n", - " \n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " \n", - " if (root.Bokeh !== undefined || force === true) {\n", - " \n", - " for (var i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - " if (force === true) {\n", - " display_loaded();\n", - " }} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - "\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };var element = document.getElementById(\"1001\");\n if (element == null) {\n console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. 
\")\n return false;\n }\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "output_notebook()" ] @@ -395,20 +66,11 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "#MODEL_RUN_PATH = \"libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_RUN_PATH = \"libritts_360-half-September-28-2019_10+46AM-8565c50/\"\n", -======= "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", ->>>>>>> dev "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", "\n", @@ -418,81 +80,23 @@ "\n", "# My multi speaker locations\n", "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", -<<<<<<< HEAD - "AUDIO_PATH = \"datasets/LibriTTS/test-clean/\"" -======= "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" ->>>>>>> dev ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "best_model.pth.tar\r\n", - "config.json\r\n", - "events.out.tfevents.1569660396.erogol-desktop\r\n" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "!ls -1 $MODEL_RUN_PATH" ] }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " > Setting up Audio Processor...\n", - " | > sample_rate:16000\n", - " | > num_mels:40\n", - " | > min_level_db:-100\n", - " | > frame_shift_ms:12.5\n", - " | > frame_length_ms:50\n", - " | > ref_level_db:20\n", - " | > fft_size:1024\n", - " | > power:None\n", - " | > preemphasis:0.98\n", - " | > griffin_lim_iters:None\n", - " | 
> signal_norm:True\n", - " | > symmetric_norm:True\n", - " | > mel_fmin:0\n", - " | > mel_fmax:8000.0\n", - " | > spec_gain:20.0\n", - " | > stft_pad_mode:reflect\n", - " | > max_norm:4.0\n", - " | > clip_norm:True\n", - " | > do_trim_silence:False\n", - " | > trim_db:60\n", - " | > do_sound_norm:False\n", - " | > stats_path:None\n", - " | > hop_length:200\n", - " | > win_length:800\n" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "CONFIG = load_config(CONFIG_PATH)\n", "ap = AudioProcessor(**CONFIG['audio'])" @@ -507,23 +111,9 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings found: 0\n" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", "print(f'Embeddings found: {len(embed_files)}')" @@ -538,27 +128,9 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "ename": "IndexError", - "evalue": "list index out of range", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0membed_files\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m: list index out of range" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "embed_files[0]" ] @@ -574,23 +146,9 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Speaker count: 0\n" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", "speaker_to_utter = {}\n", @@ -614,32 +172,11 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "'a' cannot be empty unless no samples are taken", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 14\u001b[0;31m \u001b[0mspeaker_idxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspeaker_paths\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_speakers\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 15\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mspeaker_num\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_idx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspeaker_idxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mmtrand.pyx\u001b[0m in \u001b[0;36mnumpy.random.mtrand.RandomState.choice\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: 'a' cannot be empty unless no samples are taken" - ] - } - ], - "source": [ - "ttsembeds = []\n", -======= "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeds = []\n", ->>>>>>> dev "labels = []\n", "locations = []\n", "\n", @@ -663,11 +200,7 @@ " embed = np.load(embed_path)\n", " embeds.append(embed)\n", " labels.append(str(speaker_num))\n", -<<<<<<< HEAD - " #locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", -======= " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", ->>>>>>> dev "embeds = np.concatenate(embeds)" ] }, @@ -680,27 +213,9 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "module 'umap' has no attribute 'UMAP'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mumap\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mUMAP\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprojection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membeds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: module 'umap' has no attribute 'UMAP'" - ] - } - ], -======= "execution_count": null, "metadata": {}, "outputs": [], ->>>>>>> dev "source": [ "model = umap.UMAP()\n", "projection = model.fit_transform(embeds)" @@ -804,11 +319,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", -<<<<<<< HEAD - "version": "3.8.5" -======= "version": "3.7.4" ->>>>>>> dev } }, "nbformat": 4, diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index ed1c245b..5d8eed85 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -37,7 +37,9 @@ "import librosa.display\n", "\n", "from TTS.tts.layers import *\n", - "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.utils.audio import AudioProcessor + +\n", "from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.text import text_to_sequence\n", diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 8aa3a025..6ff2d2ca 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -31,7 +31,7 @@ "from multiprocessing import Pool\n", "from matplotlib import pylab as plt\n", "from collections import Counter\n", - "from TTS.tts.datasets.preprocess import *\n", 
+ "from TTS.tts.datasets.formatters import *\n", "%matplotlib inline" ] }, diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb index f9540d06..e659511a 100644 --- a/notebooks/dataset_analysis/PhonemeCoverage.ipynb +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -50,7 +50,7 @@ "source": [ "# import stuff\n", "from TTS.utils.io import load_config\n", - "from TTS.tts.datasets.preprocess import load_meta_data\n", + "from TTS.tts.datasets.formatters import load_meta_data\n", "from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme\n", "from tqdm import tqdm\n", "from matplotlib import pylab as plt\n", diff --git a/pyproject.toml b/pyproject.toml index feaf5fd4..0941a906 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools", "wheel", "Cython", "numpy==1.18.5"] +requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"] [flake8] max-line-length=120 diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh index 86fda642..69800cf7 100644 --- a/recipes/kokoro/tacotron2-DDC/run.sh +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -16,8 +16,8 @@ tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.c python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/ # training .... # change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ - --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/$CORPUS \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ + --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \ \ No newline at end of file diff --git a/recipes/ljspeech/README.md b/recipes/ljspeech/README.md new file mode 100644 index 00000000..94508a7f --- /dev/null +++ b/recipes/ljspeech/README.md @@ -0,0 +1,19 @@ +# 🐸💬 TTS LJspeech Recipes + +For running the recipes + +1. Download the LJSpeech dataset here either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```. +2. Go to your desired model folder and run the training. + + Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```) + ```terminal + CUDA_VISIBLE_DEVICES="0" python train_modelX.py + ``` + + Running bash scripts. + ```terminal + bash run.sh + ``` + +💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best +result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪. 
diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py new file mode 100644 index 00000000..4a4f86c4 --- /dev/null +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import AlignTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = AlignTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh new file mode 100644 index 00000000..14ef058d --- /dev/null +++ b/recipes/ljspeech/download_ljspeech.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 \ No newline at end of file diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py new file mode 100644 index 00000000..0a3c3838 --- /dev/null +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -0,0 +1,30 @@ +import os + +from TTS.tts.configs import GlowTTSConfig +from TTS.tts.configs import BaseDatasetConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) +config = GlowTTSConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + print_step=25, + print_eval=True, + mixed_precision=False, + output_path=output_path, + datasets=[dataset_config] +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py new file mode 100644 index 00000000..af615ace --- /dev/null +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -0,0 +1,29 @@ +import os + +from TTS.trainer import Trainer, 
TrainingArgs, init_training +from TTS.vocoder.configs import HifiganConfig + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = HifiganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py new file mode 100644 index 00000000..6b766ab7 --- /dev/null +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -0,0 +1,30 @@ +import os + +from TTS.vocoder.configs import MultibandMelganConfig +from TTS.trainer import init_training, Trainer, TrainingArgs + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = MultibandMelganConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DCA/run.sh b/recipes/ljspeech/tacotron2-DCA/run.sh new file mode 100644 index 00000000..8bcd9e3d --- /dev/null +++ b/recipes/ljspeech/tacotron2-DCA/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# # download LJSpeech dataset +# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# # extract +# tar -xjf LJSpeech-1.1.tar.bz2 +# # create train-val splits +# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +# mv LJSpeech-1.1 $RUN_DIR/ +# rm LJSpeech-1.1.tar.bz2 +# # compute dataset mean and variance for normalization +# python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... 
+# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DCA.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path /media/erogol/nvme_linux/gdrive/Projects/TTS/recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DCA/scale_stats.npy b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy new file mode 100644 index 00000000..1dc577a6 Binary files /dev/null and b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy differ diff --git a/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json new file mode 100644 index 00000000..c5b6fa52 --- /dev/null +++ b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json @@ -0,0 +1,85 @@ +{ + "datasets": [ + { + "name": "ljspeech", + "path": "DEFINE THIS", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ], + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_length_ms": null, + "frame_shift_ms": null, + "sample_rate": 22050, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_trim_silence": true, + "trim_db": 60, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 50.0, + "mel_fmax": 7600.0, + "spec_gain": 1, + "signal_norm": true, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": "scale_stats.npy" + }, + "distributed_backend": "nlcc", + "distributed_url": "tcp:\/\/localhost:54321", + "model": "Tacotron2", + "run_name": "ljspeech-dca", + "run_description": "tacotron2 with dynamic conv attention.", + "batch_size": 64, + "eval_batch_size": 16, + "mixed_precision": true, + "loss_masking": true, + "decoder_loss_alpha": 0.25, + "postnet_loss_alpha": 0.25, + "postnet_diff_spec_alpha": 0.25, + "decoder_diff_spec_alpha": 0.25, + "decoder_ssim_alpha": 0.25, + "postnet_ssim_alpha": 0.25, + "ga_alpha": 5.0, + "stopnet_pos_weight": 15.0, + "run_eval": true, + "test_delay_epochs": 10, + "max_decoder_steps": 50, + "noam_schedule": true, + "grad_clip": 0.05, + "epochs": 1000, + "lr": 0.001, + "wd": 1e-06, + "warmup_steps": 4000, + "memory_size": -1, + "prenet_type": "original", + "prenet_dropout": true, + "attention_type": "dynamic_convolution", + "location_attn": true, + "attention_norm": "sigmoid", + "r": 2, + "stopnet": true, + "separate_stopnet": true, + "print_step": 25, + "tb_plot_step": 100, + "print_eval": false, + "save_step": 10000, + "checkpoint": true, + "text_cleaner": "phoneme_cleaners", + "num_loader_workers": 4, + "num_val_loader_workers": 4, + "batch_group_size": 4, + "min_seq_len": 6, + "max_seq_len": 180, + "compute_input_seq_cache": true, + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", + "use_phonemes": false, + "phoneme_language": "en-us" +} diff --git a/recipes/ljspeech/tacotron2-DDC/run.sh b/recipes/ljspeech/tacotron2-DDC/run.sh index eaa05b60..dd36454f 100644 --- a/recipes/ljspeech/tacotron2-DDC/run.sh +++ b/recipes/ljspeech/tacotron2-DDC/run.sh @@ -16,7 +16,7 @@ rm LJSpeech-1.1.tar.bz2 python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ # training .... 
# change the GPU id if needed -CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ - --coqpit.output_path $RUN_DIR \ - --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ - --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/ljspeech/tacotron2-DDC/scale_stats.npy b/recipes/ljspeech/tacotron2-DDC/scale_stats.npy new file mode 100644 index 00000000..1dc577a6 Binary files /dev/null and b/recipes/ljspeech/tacotron2-DDC/scale_stats.npy differ diff --git a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json index 9cdbbd3b..d787c138 100644 --- a/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json +++ b/recipes/ljspeech/tacotron2-DDC/tacotron2-DDC.json @@ -1,4 +1,5 @@ { + "model": "Tacotron2", "datasets": [ { "name": "ljspeech", @@ -36,12 +37,13 @@ "gst_num_heads": 4, "gst_num_style_tokens": 10 }, - "model": "Tacotron2", + "distributed_backend": "gloo", + "distributed_url": "tcp:\/\/localhost:54321", "run_name": "ljspeech-ddc", "run_description": "tacotron2 with double decoder consistency.", "batch_size": 64, "eval_batch_size": 16, - "mixed_precision": true, + "mixed_precision": false, "loss_masking": true, "decoder_loss_alpha": 0.25, "postnet_loss_alpha": 0.25, @@ -54,6 +56,7 @@ "run_eval": true, "test_delay_epochs": 10, "test_sentences_file": null, + "max_decoder_steps": 50, "noam_schedule": true, "grad_clip": 0.05, "epochs": 1000, @@ -88,4 +91,4 @@ "phoneme_cache_path": "DEFINE THIS", "use_phonemes": false, "phoneme_language": "en-us" -} \ No newline at end of file +} diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py new file mode 100644 index 00000000..a442b451 --- /dev/null +++ b/recipes/ljspeech/univnet/train.py @@ -0,0 +1,30 @@ +import os + +from TTS.config.shared_configs import BaseAudioConfig +from TTS.trainer import Trainer, TrainingArgs, init_training +from TTS.vocoder.configs import UnivnetConfig + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = UnivnetConfig( + batch_size=64, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + seq_len=8192, + pad_short=2000, + use_noise_augment=True, + eval_split_size=10, + print_step=25, + print_eval=False, + mixed_precision=False, + lr_gen=1e-4, + lr_disc=1e-4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/wavegrad/train_wavegrad.py b/recipes/ljspeech/wavegrad/train_wavegrad.py new file mode 100644 index 00000000..323b2bb7 --- /dev/null +++ b/recipes/ljspeech/wavegrad/train_wavegrad.py @@ -0,0 +1,29 @@ +import os + +from TTS.trainer import Trainer, init_training +from TTS.trainer import TrainingArgs +from TTS.vocoder.configs import WavegradConfig + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = WavegradConfig( + batch_size=32, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + 
epochs=1000, + seq_len=6144, + pad_short=2000, + use_noise_augment=True, + eval_split_size=50, + print_step=50, + print_eval=True, + mixed_precision=False, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger) +trainer.fit() diff --git a/recipes/ljspeech/wavernn/train_wavernn.py b/recipes/ljspeech/wavernn/train_wavernn.py new file mode 100644 index 00000000..76ff722a --- /dev/null +++ b/recipes/ljspeech/wavernn/train_wavernn.py @@ -0,0 +1,30 @@ +import os + +from TTS.trainer import Trainer, init_training, TrainingArgs +from TTS.vocoder.configs import WavernnConfig + + +output_path = os.path.dirname(os.path.abspath(__file__)) +config = WavernnConfig( + batch_size=64, + eval_batch_size=16, + num_loader_workers=4, + num_eval_loader_workers=4, + run_eval=True, + test_delay_epochs=-1, + epochs=10000, + seq_len=1280, + pad_short=2000, + use_noise_augment=False, + eval_split_size=10, + print_step=25, + print_eval=True, + mixed_precision=False, + lr=1e-4, + grad_clip=4, + data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), + output_path=output_path, +) +args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) +trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True) +trainer.fit() diff --git a/requirements.tf.txt b/requirements.tf.txt index 60f6e6c9..8e256a90 100644 --- a/requirements.tf.txt +++ b/requirements.tf.txt @@ -1 +1 @@ -tensorflow==2.3.1 +tensorflow==2.5.0 diff --git a/requirements.txt b/requirements.txt index 046139d0..d5624c3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ inflect jieba librosa==0.8.0 matplotlib -numpy==1.18.5 +numpy==1.19.5 pandas pypinyin pysbd @@ -15,8 +15,8 @@ soundfile tensorboardX torch>=1.7 tqdm -numba==0.52 -umap-learn==0.4.6 +numba==0.53 +umap-learn==0.5.1 anyascii coqpit # japanese g2p deps diff --git a/setup.py b/setup.py index 7cfb6519..bd6a6aae 100644 --- a/setup.py +++ b/setup.py @@ -11,9 +11,8 @@ import setuptools.command.develop from Cython.Build import cythonize from setuptools import Extension, find_packages, setup - -if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"): - raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version)) +if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"): + raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) cwd = os.path.dirname(os.path.abspath(__file__)) @@ -99,7 +98,7 @@ setup( "notebooks": requirements_notebooks, "tf": requirements_tf, }, - python_requires=">=3.6.0, <3.9", + python_requires=">=3.6.0, <3.10", entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, classifiers=[ "Programming Language :: Python", @@ -107,6 +106,7 @@ setup( "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", "Intended Audience :: Developers", diff --git a/tests/data/ljspeech/metadata.csv b/tests/data/ljspeech/metadata.csv index 8f7832b5..6c65ca0d 100644 --- a/tests/data/ljspeech/metadata.csv +++ 
b/tests/data/ljspeech/metadata.csv @@ -6,27 +6,3 @@ LJ001-0005|the invention of movable metal letters in the middle of the fifteenth LJ001-0006|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography, LJ001-0007|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five, LJ001-0008|has never been surpassed.|has never been surpassed. -LJ001-0009|Printing, then, for our purpose, may be considered as the art of making books by means of movable types.|Printing, then, for our purpose, may be considered as the art of making books by means of movable types. -LJ001-0010|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress,|Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress, -LJ001-0011|it is of the first importance that the letter used should be fine in form;|it is of the first importance that the letter used should be fine in form; -LJ001-0012|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters|especially as no more time is occupied, or cost incurred, in casting, setting, or printing beautiful letters -LJ001-0013|than in the same operations with ugly ones.|than in the same operations with ugly ones. -LJ001-0014|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were,|And it was a matter of course that in the Middle Ages, when the craftsmen took care that beautiful form should always be a part of their productions whatever they were, -LJ001-0015|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves.|the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. -LJ001-0016|The Middle Ages brought calligraphy to perfection, and it was natural therefore|The Middle Ages brought calligraphy to perfection, and it was natural therefore -LJ001-0017|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely.|that the forms of printed letters should follow more or less closely those of the written character, and they followed them very closely. -LJ001-0018|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character,|The first books were printed in black letter, i.e. the letter which was a Gothic development of the ancient Roman character, -LJ001-0019|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters;|and which developed more completely and satisfactorily on the side of the "lower-case" than the capital letters; -LJ001-0020|the "lower-case" being in fact invented in the early Middle Ages.|the "lower-case" being in fact invented in the early Middle Ages. 
-LJ001-0021|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation|The earliest book printed with movable type, the aforesaid Gutenberg Bible, is printed in letters which are an exact imitation -LJ001-0022|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type,"|of the more formal ecclesiastical writing which obtained at that time; this has since been called "missal type," -LJ001-0023|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century.|and was in fact the kind of letter used in the many splendid missals, psalters, etc., produced by printing in the fifteenth century. -LJ001-0024|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year 1462)|But the first Bible actually dated (which also was printed at Maintz by Peter Schoeffer in the year fourteen sixty-two) -LJ001-0025|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read.|imitates a much freer hand, simpler, rounder, and less spiky, and therefore far pleasanter and easier to read. -LJ001-0026|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type,|On the whole the type of this book may be considered the ne-plus-ultra of Gothic type, -LJ001-0027|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer,|especially as regards the lower-case letters; and type very similar was used during the next fifteen or twenty years not only by Schoeffer, -LJ001-0028|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities.|but by printers in Strasburg, Basle, Paris, Lubeck, and other cities. -LJ001-0029|But though on the whole, except in Italy, Gothic letter was most often used|But though on the whole, except in Italy, Gothic letter was most often used -LJ001-0030|a very few years saw the birth of Roman character not only in Italy, but in Germany and France.|a very few years saw the birth of Roman character not only in Italy, but in Germany and France. 
-LJ001-0031|In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,|In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome, -LJ001-0032|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman,|and used an exceedingly beautiful type, which is indeed to look at a transition between Gothic and Roman, \ No newline at end of file diff --git a/tests/data/ljspeech/speakers.json b/tests/data/ljspeech/speakers.json new file mode 100644 index 00000000..915cff73 --- /dev/null +++ b/tests/data/ljspeech/speakers.json @@ -0,0 +1,2612 @@ +{ + "LJ001-0001.wav": { + "name": "ljspeech-0", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 
0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0002.wav": { + "name": "ljspeech-1", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 
0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + 
-0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0003.wav": { + "name": "ljspeech-2", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + 
-0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 
0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0004.wav": { + "name": "ljspeech-3", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + 
-0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 
0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0005.wav": { + "name": "ljspeech-4", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 
0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0006.wav": { + "name": "ljspeech-5", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + 
-0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, 
+ 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0007.wav": { + "name": "ljspeech-6", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + 
-0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + 
-0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0008.wav": { + "name": "ljspeech-7", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + 
-0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + 
-0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0009.wav": { + "name": "ljspeech-8", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + -0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 
0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, + -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + }, + "LJ001-0010.wav": { + "name": "ljspeech-9", + "embedding": [ + 0.05539746582508087, + 0.08493061363697052, + -0.010013150051236153, + 0.04369359463453293, + -0.05871078372001648, + 0.07792330533266068, + -0.12001194059848785, + 0.09205509722232819, + -0.053687505424022675, + 0.13110113143920898, + -0.0672345906496048, + 0.09076011180877686, + -0.012022187933325768, + -0.1773194968700409, + -0.03690509498119354, + 0.052139587700366974, + -0.06511855870485306, + -0.014169753529131413, + -0.0788075178861618, + -0.022713735699653625, + 0.026002388447523117, + 0.04142642393708229, + 0.06633599102497101, + 
-0.040966324508190155, + 0.05216488242149353, + 0.043708473443984985, + 0.008947450667619705, + 0.043884553015232086, + 0.015242422930896282, + -0.07271697372198105, + -0.03943272680044174, + 0.11445401608943939, + -0.01976911909878254, + -0.001584329642355442, + 0.03226276487112045, + -0.002877067308872938, + 0.006218053866177797, + -0.09210439026355743, + -0.023884698748588562, + 0.019102394580841064, + -0.023189997300505638, + 0.07678322494029999, + 0.04511963576078415, + -0.028598245233297348, + 0.02654365450143814, + -0.026303084567189217, + -0.036059144884347916, + -0.04994352161884308, + -0.10899694263935089, + 0.16808779537677765, + 0.0568464957177639, + 0.017774248495697975, + -0.0766686350107193, + -0.08056356757879257, + 0.11318203061819077, + -0.0009237118065357208, + -0.11983267217874527, + -0.04011853411793709, + 0.06481920927762985, + 0.18528658151626587, + -0.020618144422769547, + 0.0030966848134994507, + 0.030582068488001823, + 0.11048240959644318, + 0.026203282177448273, + 0.08886025100946426, + 0.0776662528514862, + 0.08468905836343765, + 0.02009391225874424, + 0.053141623735427856, + 0.04102938249707222, + 0.059041380882263184, + -0.006237464025616646, + -0.018360337242484093, + 0.015418153256177902, + -0.03559226542711258, + -0.05805520713329315, + -0.00861218199133873, + -0.021234268322587013, + -0.025556275621056557, + -0.012332704849541187, + -0.009777471423149109, + 0.03721384331583977, + 0.010376224294304848, + -0.05210898444056511, + 0.035450324416160583, + 0.0026437342166900635, + -0.03329150378704071, + 0.07028764486312866, + 0.03101171739399433, + 0.003101848065853119, + 0.029428653419017792, + -0.03445912152528763, + -0.11992329359054565, + -0.006469260435551405, + 0.02472860924899578, + -0.0021879260893911123, + 0.06576769798994064, + 0.04159736633300781, + -0.044104330241680145, + 0.10868340730667114, + 0.06065361574292183, + -0.00814537052065134, + 0.029497724026441574, + -0.0820949599146843, + 0.09694784879684448, + 0.10299994796514511, + 0.007466038689017296, + 0.0573151595890522, + -0.04003140702843666, + 0.0748046338558197, + 0.07954449951648712, + -0.14061805605888367, + -0.07225356996059418, + 0.030713198706507683, + -0.01169175747781992, + 0.015277700498700142, + 0.101996049284935, + 0.0023796744644641876, + 0.013835912570357323, + 0.08836984634399414, + -0.08798637241125107, + -0.053786784410476685, + -0.025867177173495293, + 0.07090725004673004, + -0.05228910967707634, + 0.024839768186211586, + 0.0543626993894577, + -0.048099253326654434, + -0.01027676835656166, + 0.04654526337981224, + -0.0034045036882162094, + 0.003895972855389118, + 0.04250902682542801, + -0.05232023075222969, + 0.06287448853254318, + -0.04146592691540718, + -0.0022073618602007627, + 0.07169511169195175, + 0.057035692036151886, + 0.04202979430556297, + -0.01752091944217682, + -0.03615778684616089, + -0.07597745209932327, + 0.0076013305224478245, + 0.03388708084821701, + 0.06191568076610565, + -0.01607775315642357, + 0.004401837941259146, + -0.06070601940155029, + -0.07674850523471832, + 0.059249889105558395, + -0.02222420647740364, + 0.10215721279382706, + -0.000883960397914052, + 0.010600706562399864, + 0.09869417548179626, + 0.011313805356621742, + -0.01187396701425314, + -0.04851905256509781, + -0.020747501403093338, + 0.043711841106414795, + 0.04022590070962906, + -0.06653523445129395, + -0.04014153778553009, + 0.012923783622682095, + 0.0024894566740840673, + -0.03801071271300316, + 0.017412755638360977, + 0.03090047463774681, + 0.021060986444354057, + 0.04588426649570465, 
+ -0.061013057827949524, + 0.022323710843920708, + -0.0921829417347908, + -0.009262383915483952, + -0.0024641728959977627, + -0.04311069846153259, + -0.02953970432281494, + 0.11183556914329529, + 0.041883185505867004, + 0.01362229697406292, + -0.009713159874081612, + -0.07398185133934021, + -0.03448636084794998, + 0.06774093955755234, + 0.06281304359436035, + 0.005423923954367638, + 0.04070146754384041, + 0.04723779857158661, + 0.0025808606296777725, + 0.04067641496658325, + 0.0840836763381958, + 0.0662192553281784, + 6.253225728869438e-05, + -0.03287994861602783, + -0.07941965758800507, + 0.09294897317886353, + 0.08651109039783478, + -0.09662938117980957, + -0.08838298916816711, + -0.05120178312063217, + -0.06626439094543457, + 0.04893879592418671, + -0.017820902168750763, + -0.007398976478725672, + 0.02896031364798546, + -0.025766948238015175, + -0.10214102268218994, + -0.10014186799526215, + 0.1211889386177063, + -0.0510331466794014, + -0.02461140602827072, + -0.06880723685026169, + 0.02751768007874489, + 0.07350686937570572, + 0.038249749690294266, + -0.009252945892512798, + 0.013650302775204182, + 0.04884907230734825, + -0.08785197138786316, + 0.003136417828500271, + 0.05015810579061508, + -0.00904669426381588, + -0.10715165734291077, + 0.026881497353315353, + -0.07288249582052231, + 0.08610662072896957, + -0.06228051334619522, + 0.1673828363418579, + 0.006395484320819378, + -0.0426831915974617, + -0.08067314326763153, + 0.06747708469629288, + -0.049200400710105896, + 0.0475490465760231, + 0.05716557055711746, + 0.060844384133815765, + 0.04086177423596382, + -0.08346255123615265, + 0.0869344025850296, + 0.019769223406910896, + -0.020300764590501785, + -0.0708683505654335, + -0.030514180660247803, + -0.027429744601249695, + 0.021853724494576454, + -0.012019682675600052, + -0.0613793209195137, + 0.009929075837135315, + 0.0261012464761734, + -0.018161576241254807, + 0.07936893403530121, + 0.12791746854782104, + 0.08958099782466888, + -0.09469571709632874 + ] + } +} diff --git a/tests/data_tests/__init__.py b/tests/data_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/data_tests/test_dataset_formatters.py b/tests/data_tests/test_dataset_formatters.py index 968e2a29..bd83002c 100644 --- a/tests/data_tests/test_dataset_formatters.py +++ b/tests/data_tests/test_dataset_formatters.py @@ -2,7 +2,7 @@ import os import unittest from tests import get_tests_input_path -from TTS.tts.datasets.preprocess import common_voice +from TTS.tts.datasets.formatters import common_voice class TestPreprocessors(unittest.TestCase): diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py index e2dba37a..9bc70ddd 100644 --- a/tests/data_tests/test_loader.py +++ b/tests/data_tests/test_loader.py @@ -9,7 +9,7 @@ from torch.utils.data import DataLoader from tests import get_tests_output_path from TTS.tts.configs import BaseTTSConfig from TTS.tts.datasets import TTSDataset -from TTS.tts.datasets.preprocess import ljspeech +from TTS.tts.datasets.formatters import ljspeech from TTS.utils.audio import AudioProcessor # pylint: disable=unused-variable @@ -38,13 +38,13 @@ class TestTTSDataset(unittest.TestCase): def _create_dataloader(self, batch_size, r, bgs): items = ljspeech(c.data_path, "metadata.csv") - dataset = TTSDataset.MyDataset( + dataset = TTSDataset( r, c.text_cleaner, compute_linear_spec=True, ap=self.ap, meta_data=items, - tp=c.characters, + characters=c.characters, batch_group_size=bgs, min_seq_len=c.min_seq_len, max_seq_len=float("inf"), diff --git 
a/tests/inference_tests/__init__.py b/tests/inference_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 62eb6dbe..526f7dc8 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -10,19 +10,19 @@ def test_synthesize(): # single speaker model run_cli(f'tts --text "This is an example." --out_path "{output_path}"') - # run_cli( - # "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " - # f'--text "This is an example." --out_path "{output_path}"' - # ) - # run_cli( - # "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " - # "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " - # f'--text "This is an example." --out_path "{output_path}"' - # ) + run_cli( + "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " + f'--text "This is an example." --out_path "{output_path}"' + ) + run_cli( + "tts --model_name tts_models/en/ljspeech/speedy-speech-wn " + "--vocoder_name vocoder_models/en/ljspeech/multiband-melgan " + f'--text "This is an example." --out_path "{output_path}"' + ) - # # multi-speaker model - # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - # run_cli( - # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - # f'--text "This is an example." --out_path "{output_path}"' - # ) + # multi-speaker model + run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") + run_cli( + f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' + f'--text "This is an example." --out_path "{output_path}"' + ) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index a1cd4de5..5972dc90 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -1,27 +1,22 @@ import os import unittest -from tests import get_tests_output_path from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.io import save_checkpoint -from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.tts.models import setup_model +from TTS.utils.io import save_checkpoint from TTS.utils.synthesizer import Synthesizer +from .. import get_tests_output_path + class SynthesizerTest(unittest.TestCase): # pylint: disable=R0201 def _create_random_model(self): # pylint: disable=global-statement - global symbols, phonemes config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json")) - if config.has("characters") and config.characters: - symbols, phonemes = make_symbols(**config.characters.to_dict()) - - num_chars = len(phonemes) if config.use_phonemes else len(symbols) - model = setup_model(num_chars, 0, config) + model = setup_model(config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, 10, 10, 1, output_path, None) + save_checkpoint(config, model, None, None, 10, 1, output_path) def test_in_out(self): self._create_random_model() diff --git a/tests/inputs/test_align_tts.json b/tests/inputs/test_align_tts.json index 964cc66d..a0d677ad 100644 --- a/tests/inputs/test_align_tts.json +++ b/tests/inputs/test_align_tts.json @@ -123,7 +123,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. 
Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -140,8 +140,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index 64cc3822..6dd86057 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -115,7 +115,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 500, // DATASET-RELATED: maximum text length @@ -132,8 +132,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. // DATASETS diff --git a/tests/inputs/test_speedy_speech.json b/tests/inputs/test_speedy_speech.json index a29fc992..02783d21 100644 --- a/tests/inputs/test_speedy_speech.json +++ b/tests/inputs/test_speedy_speech.json @@ -120,7 +120,7 @@ "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. 
"batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 2, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 300, // DATASET-RELATED: maximum text length @@ -137,8 +137,8 @@ // MULTI-SPEAKER and GST "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. - "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 - "external_speaker_embedding_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "use_d_vector_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 + "d_vector_file": "/home/erogol/Data/libritts/speakers.json", // if not null and use_d_vector_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs /1806.04558 // DATASETS diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index cc2c1bb5..6c82891d 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_bd_config.json b/tests/inputs/test_tacotron_bd_config.json index 9d2935aa..fbf3c001 100644 --- a/tests/inputs/test_tacotron_bd_config.json +++ b/tests/inputs/test_tacotron_bd_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index c8fae623..b60ed35e 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -130,7 +130,7 @@ "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 153, // DATASET-RELATED: maximum text length @@ -145,8 +145,8 @@ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST - "use_external_speaker_embedding_file": false, - "external_speaker_embedding_file": null, + "use_d_vector_file": false, + "d_vector_file": null, "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. "use_gst": true, // use global style tokens "gst": { // gst parameter if gst is enabled diff --git a/tests/inputs/test_vocoder_multiband_melgan_config.json b/tests/inputs/test_vocoder_multiband_melgan_config.json index 794a3fcc..b8b192e4 100644 --- a/tests/inputs/test_vocoder_multiband_melgan_config.json +++ b/tests/inputs/test_vocoder_multiband_melgan_config.json @@ -157,7 +157,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 10, // PATHS diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json index f6208e8d..6378c07a 100644 --- a/tests/inputs/test_vocoder_wavegrad.json +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -88,7 +88,7 @@ // OPTIMIZER "epochs": 1, // total number of epochs to train. - "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "grad_clip": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, @@ -107,7 +107,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. 
+ "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 4, // PATHS diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json index decafa70..ee4e5f8e 100644 --- a/tests/inputs/test_vocoder_wavernn_config.json +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -55,7 +55,7 @@ "padding": 2, // pad the input for resnet to see wider input length // GENERATOR - for backward compatibility - "generator_model": "WaveRNN", + "generator_model": "Wavernn", // DATASET //"use_gta": true, // use computed gta features from the tts model @@ -103,7 +103,7 @@ // DATA LOADING "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_eval_loader_workers": 0, // number of evaluation data loader processes. "eval_split_size": 10, // number of samples for testing // PATHS diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index 38cee473..8c795d58 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -5,8 +5,7 @@ import torch from tests import get_tests_input_path, get_tests_output_path, run_cli from TTS.config import load_config -from TTS.tts.utils.generic_utils import setup_model -from TTS.tts.utils.text.symbols import phonemes, symbols +from TTS.tts.models import setup_model torch.manual_seed(1) @@ -21,8 +20,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -40,8 +38,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -59,8 +56,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): # load config c = load_config(config_path) # create model - num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(c) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/test_speaker_encoder_train.py b/tests/test_speaker_encoder_train.py index 21b12074..4419a00f 100644 --- a/tests/test_speaker_encoder_train.py +++ b/tests/test_speaker_encoder_train.py @@ -6,7 +6,18 @@ from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +def run_test_train(): + command = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + ) + run_cli(command) + +config_path = 
os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeakerEncoderConfig( @@ -24,16 +35,9 @@ config.audio.do_trim_silence = True config.audio.trim_db = 60 config.save_json(config_path) +print(config) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " -) -run_cli(command_train) +run_test_train() # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) @@ -50,15 +54,7 @@ config.model_params["model_name"] = "resnet" config.save_json(config_path) # train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " - f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " -) -run_cli(command_train) +run_test_train() # Find latest folder continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) @@ -69,3 +65,18 @@ command_train = ( ) run_cli(command_train) shutil.rmtree(continue_path) + +# test model with ge2e loss function +config.loss = "ge2e" +config.save_json(config_path) +run_test_train() + +# test model with angleproto loss function +config.loss = "angleproto" +config.save_json(config_path) +run_test_train() + +# test model with softmaxproto loss function +config.loss = "softmaxproto" +config.save_json(config_path) +run_test_train() diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index f80e56fc..baa50749 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -15,11 +15,11 @@ encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") -x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") +d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") class SpeakerManagerTest(unittest.TestCase): - """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + """Test SpeakerManager for loading embedding files and computing d_vectors from waveforms""" @staticmethod def test_speaker_embedding(): @@ -38,38 +38,38 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - x_vector = manager.compute_x_vector(mel.T) - assert x_vector.shape[1] == 256 + d_vector = manager.compute_d_vector(mel.T) + assert d_vector.shape[1] == 256 - # compute x_vector directly from an input file - x_vector = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector = torch.FloatTensor(x_vector) - x_vector2 = 
torch.FloatTensor(x_vector2) - assert x_vector.shape[0] == 256 - assert (x_vector - x_vector2).sum() == 0.0 + # compute d_vector directly from an input file + d_vector = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = torch.FloatTensor(d_vector) + d_vector2 = torch.FloatTensor(d_vector2) + assert d_vector.shape[0] == 256 + assert (d_vector - d_vector2).sum() == 0.0 - # compute x_vector from a list of wav files. - x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2]) - x_vector3 = torch.FloatTensor(x_vector3) - assert x_vector3.shape[0] == 256 - assert (x_vector - x_vector3).sum() != 0.0 + # compute d_vector from a list of wav files. + d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = torch.FloatTensor(d_vector3) + assert d_vector3.shape[0] == 256 + assert (d_vector - d_vector3).sum() != 0.0 # remove dummy model os.remove(encoder_model_path) @staticmethod def test_speakers_file_processing(): - manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) + manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.x_vector_dim) + print(manager.d_vector_dim) print(manager.clip_ids) - x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) - assert len(x_vector) == 256 - x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) - assert len(x_vectors[0]) == 256 - x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) - assert len(x_vector1) == 256 - x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) - assert len(x_vector2) == 256 - assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0 + d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + assert len(d_vector) == 256 + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) + assert len(d_vectors[0]) == 256 + d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) + assert len(d_vector1) == 256 + d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) + assert len(d_vector2) == 256 + assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/text_tests/__init__.py b/tests/text_tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 848f46c1..3700b1d3 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -13,7 +13,7 @@ config = AlignTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", @@ -23,6 +23,9 @@ config = AlignTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 @@ -30,12 +33,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train 
metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs -1" ) run_cli(command_train) @@ -43,8 +47,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_align_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_feed_forward_layers.py b/tests/tts_tests/test_feed_forward_layers.py index 1db980a3..1c2d3803 100644 --- a/tests/tts_tests/test_feed_forward_layers.py +++ b/tests/tts_tests/test_feed_forward_layers.py @@ -2,7 +2,7 @@ import torch from TTS.tts.layers.feed_forward.decoder import Decoder from TTS.tts.layers.feed_forward.encoder import Encoder -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 486de274..171f2cdc 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -34,71 +34,18 @@ class GlowTTSTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) criterion = GlowTTSLoss() # model to train - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) # reference model to compare model weights - model_ref = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + model_ref = GlowTTS(config).to(device) model.train() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) @@ -114,10 +61,17 @@ class GlowTTSTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=0.001) for _ in range(5): optimizer.zero_grad() 
- z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, None + outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths, None) + loss_dict = criterion( + outputs["model_outputs"], + outputs["y_mean"], + outputs["y_log_scale"], + outputs["logdet"], + mel_lengths, + outputs["durations_log"], + outputs["total_durations_log"], + input_lengths, ) - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, input_lengths) loss = loss_dict["loss"] loss.backward() optimizer.step() @@ -137,50 +91,24 @@ class GlowTTSInferenceTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) speaker_ids = torch.randint(0, 5, (8,)).long().to(device) # create model - model = GlowTTS( - num_chars=32, - hidden_channels_enc=48, - hidden_channels_dec=48, - hidden_channels_dp=32, - out_channels=80, - encoder_type="rel_pos_transformer", - encoder_params={ - "kernel_size": 3, - "dropout_p": 0.1, - "num_layers": 6, - "num_heads": 2, - "hidden_channels_ffn": 16, # 4 times the hidden_channels - "input_length": None, - }, - use_encoder_prenet=True, - num_flow_blocks_dec=12, - kernel_size_dec=5, - dilation_rate=1, - num_block_layers=4, - dropout_p_dec=0.0, - num_speakers=0, - c_in_channels=0, - num_splits=4, - num_squeeze=1, - sigmoid_scale=False, - mean_only=False, - ).to(device) + config = GlowTTSConfig(num_chars=32) + model = GlowTTS(config).to(device) model.eval() print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model))) # inference encoder and decoder with MAS - y, *_ = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths, None) + y = model.inference_with_MAS(input_dummy, input_lengths, mel_spec, mel_lengths) - y_dec, _ = model.decoder_inference(mel_spec, mel_lengths) + y2 = model.decoder_inference(mel_spec, mel_lengths) assert ( - y_dec.shape == y.shape + y2["model_outputs"].shape == y["model_outputs"].shape ), "Difference between the shapes of the glowTTS inference with MAS ({}) and the inference using only the decoder ({}) !!".format( - y.shape, y_dec.shape + y["model_outputs"].shape, y2["model_outputs"].shape ) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index e44f6365..24c5c4cf 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -13,7 +13,7 @@ config = GlowTTSConfig( batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, use_espeak_phonemes=True, @@ -24,6 +24,9 @@ config = GlowTTSConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 @@ -31,13 +34,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " 
"--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -45,8 +49,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_glow_tts.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 3473769b..a5c481f1 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -1,8 +1,9 @@ import torch +from TTS.tts.configs import SpeedySpeechConfig from TTS.tts.layers.feed_forward.duration_predictor import DurationPredictor -from TTS.tts.models.speedy_speech import SpeedySpeech -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.models.speedy_speech import SpeedySpeech, SpeedySpeechArgs +from TTS.tts.utils.data import sequence_mask use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -40,31 +41,56 @@ def test_speedy_speech(): y_lengths = durations.sum(1) - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128) + config = SpeedySpeechConfig(model_args=SpeedySpeechArgs(num_chars=num_chars, out_channels=80, hidden_channels=128)) + model = SpeedySpeech(config) if use_cuda: model.cuda() # forward pass - o_de, o_dr, attn = model(x_dummy, x_lengths, y_lengths, durations) + outputs = model(x_dummy, x_lengths, y_lengths, durations) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] # with speaker embedding - model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device)) + config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, out_channels=80, hidden_channels=128, num_speakers=80, d_vector_dim=256 + ) + ) + model = SpeedySpeech(config).to(device) + model.forward( + x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} + ) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] # with speaker external embedding - model = SpeedySpeech( - num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 - ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device)) + config 
= SpeedySpeechConfig( + model_args=SpeedySpeechArgs( + num_chars=num_chars, + out_channels=80, + hidden_channels=128, + num_speakers=10, + use_d_vector=True, + d_vector_dim=256, + ) + ) + model = SpeedySpeech(config).to(device) + model.forward(x_dummy, x_lengths, y_lengths, durations, aux_input={"d_vectors": torch.rand((B, 256)).to(device)}) + o_de = outputs["model_outputs"] + attn = outputs["alignments"] + o_dr = outputs["durations_log"] - assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}" + assert list(o_de.shape) == [B, T_de, 80], f"{list(o_de.shape)}" assert list(attn.shape) == [B, T_de, T_en] assert list(o_dr.shape) == [B, T_en] diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9dcf0ad8..28dc7029 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -4,16 +4,18 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs import SpeedySpeechConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs config_path = os.path.join(get_tests_output_path(), "test_speedy_speech_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeedySpeechConfig( + model_args=SpeedySpeechArgs(num_chars=50, out_channels=80, hidden_channels=128, num_speakers=0), batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, phoneme_language="en-us", @@ -23,6 +25,9 @@ config = SpeedySpeechConfig( epochs=1, print_step=1, print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], ) config.audio.do_trim_silence = True config.audio.trim_db = 60 @@ -30,13 +35,14 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt" + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" ) run_cli(command_train) @@ -44,8 +50,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_speedy_speech.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py new file mode 100644 index 00000000..3313b8c4 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -0,0 +1,57 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = 
os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + use_d_vector_file=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + d_vector_file="tests/data/ljspeech/speakers.json", + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py index 4d711700..a8132467 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -7,6 +7,7 @@ from torch import nn, optim from tests import get_tests_input_path from TTS.tts.configs import Tacotron2Config +from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.layers.losses import MSELossMasked from TTS.tts.models.tacotron2 import Tacotron2 from TTS.utils.audio import AudioProcessor @@ -17,19 +18,20 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = Tacotron2Config() +config_global = Tacotron2Config(num_chars=32, num_speakers=5, out_channels=80, decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") class TacotronTrainTest(unittest.TestCase): def test_train_step(self): # pylint: disable=no-self-use + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -38,29 +40,29 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) 
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -77,42 +79,44 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_embeddings = torch.rand(8, 55).to(device) + speaker_ids = torch.rand(8, 55).to(device) for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + 
outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -130,11 +134,12 @@ class TacotronGSTTrainTest(unittest.TestCase): # pylint: disable=no-self-use def test_train_step(self): # with random gst mel style + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -143,29 +148,31 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], 
mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -190,7 +197,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -199,29 +206,29 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, use_gst=True, gst=c.gst).to(device) + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(10): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -242,11 +249,12 @@ class TacotronGSTTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.sort(input_lengths, descending=True)[0] - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[0] = 30 stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -255,30 +263,29 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = 
stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, use_gst=True, gst=c.gst).to( - device - ) + config.d_vector_dim = 55 + model = Tacotron2(config).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for i in range(5): - mel_out, mel_postnet_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) - assert torch.sigmoid(stop_tokens).data.max() <= 1.0 - assert torch.sigmoid(stop_tokens).data.min() >= 0.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 + assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(mel_postnet_out, mel_postnet_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py new file mode 100644 index 00000000..41d694f6 --- /dev/null +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -0,0 +1,55 @@ +import glob +import os +import shutil + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs import Tacotron2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +config = Tacotron2Config( + r=5, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=False, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=True, + max_decoder_steps=50, +) + +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.name ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = 
max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron2_tf_model.py b/tests/tts_tests/test_tacotron2_tf_model.py index ee7f720b..431b0c2f 100644 --- a/tests/tts_tests/test_tacotron2_tf_model.py +++ b/tests/tts_tests/test_tacotron2_tf_model.py @@ -110,7 +110,7 @@ class TacotronTFTrainTest(unittest.TestCase): num_chars=24, num_speakers=0, r=3, - postnet_output_dim=80, + out_channels=80, decoder_output_dim=80, attn_type="original", attn_win=False, diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index dbec309b..e947a54a 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -8,13 +8,12 @@ from TTS.tts.configs import Tacotron2Config config_path = os.path.join(get_tests_output_path(), "test_model_config.json") output_path = os.path.join(get_tests_output_path(), "train_outputs") - config = Tacotron2Config( r=5, batch_size=8, eval_batch_size=8, num_loader_workers=0, - num_val_loader_workers=0, + num_eval_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=False, phoneme_language="en-us", @@ -23,7 +22,11 @@ config = Tacotron2Config( test_delay_epochs=-1, epochs=1, print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], print_eval=True, + max_decoder_steps=50, ) config.audio.do_trim_silence = True config.audio.trim_db = 60 @@ -31,12 +34,13 @@ config.save_json(config_path) # train the model for one epoch command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} " + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " f"--coqpit.output_path {output_path} " "--coqpit.datasets.0.name ljspeech " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -44,8 +48,6 @@ run_cli(command_train) continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) # restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} " -) +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " run_cli(command_train) shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index 9b89e645..783be0db 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -4,7 +4,7 @@ import torch as T from TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet -from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.data import sequence_mask # pylint: disable=unused-variable @@ -61,6 +61,7 @@ class DecoderTests(unittest.TestCase): forward_attn_mask=True, location_attn=True, separate_stopnet=True, + max_decoder_steps=50, ) dummy_input = T.rand(4, 8, 256) dummy_memory = T.rand(4, 2, 80) diff --git a/tests/tts_tests/test_tacotron_model.py 
b/tests/tts_tests/test_tacotron_model.py index fcbac0f7..6c673568 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -6,7 +6,7 @@ import torch from torch import nn, optim from tests import get_tests_input_path -from TTS.tts.configs import TacotronConfig +from TTS.tts.configs import GSTConfig, TacotronConfig from TTS.tts.layers.losses import L1LossMasked from TTS.tts.models.tacotron import Tacotron from TTS.utils.audio import AudioProcessor @@ -17,9 +17,9 @@ torch.manual_seed(1) use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = TacotronConfig() +config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80) -ap = AudioProcessor(**c.audio) +ap = AudioProcessor(**config_global.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") @@ -31,11 +31,12 @@ def count_parameters(model): class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -44,21 +45,12 @@ class TacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -66,15 +58,15 @@ class TacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + 
criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -91,11 +83,12 @@ class TacotronTrainTest(unittest.TestCase): class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) @@ -104,22 +97,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - speaker_embedding_dim=55, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.d_vector_dim = 55 + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) @@ -127,15 +111,15 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(5): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -152,12 +136,13 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): + config = config_global.copy() # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device) - linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device) + mel_spec = 
torch.rand(8, 120, config.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, 120, (8,)).long().to(device) mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) @@ -166,23 +151,14 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - postnet_output_dim=c.audio["fft_size"], - decoder_output_dim=c.audio["num_mels"], - r=c.r, - memory_size=c.memory_size, - ).to( - device - ) # FIXME: missing num_speakers parameter to Tacotron ctor + config.use_gst = True + config.gst = GSTConfig() + model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) @@ -191,15 +167,15 @@ class TacotronGSTTrainTest(unittest.TestCase): for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 - optimizer = optim.Adam(model.parameters(), lr=c.lr) + optimizer = optim.Adam(model.parameters(), lr=config.lr) for _ in range(10): - mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + outputs = model.forward( + input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids} ) optimizer.zero_grad() - loss = criterion(mel_out, mel_spec, mel_lengths) - stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss + loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) + stop_loss = criterion_st(outputs["stop_tokens"], stop_targets) + loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes @@ -220,7 +196,7 @@ class TacotronGSTTrainTest(unittest.TestCase): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device) + linear_spec = torch.rand(8, mel_spec.size(1), config.audio["fft_size"] // 2 + 1).to(device) mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) @@ -229,23 +205,12 @@ class TacotronGSTTrainTest(unittest.TestCase): for idx in mel_lengths: stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron( - num_chars=32, - num_speakers=5, - use_gst=True, - gst=c.gst, - 
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            r=c.r,
-            memory_size=c.memory_size,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         # print(model)
         print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model)))
@@ -254,15 +219,15 @@ class TacotronGSTTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(10):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"speaker_ids": speaker_ids}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
@@ -278,11 +243,12 @@ class TacotronGSTTrainTest(unittest.TestCase):
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
+        config = config_global.copy()
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
         input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
-        linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device)
+        mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
+        linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[-1] = mel_spec.size(1)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
@@ -291,24 +257,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for idx in mel_lengths:
             stop_targets[:, int(idx.item()) :, 0] = 1.0

-        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
         criterion = L1LossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron(
-            num_chars=32,
-            num_speakers=5,
-            postnet_output_dim=c.audio["fft_size"],
-            decoder_output_dim=c.audio["num_mels"],
-            use_gst=True,
-            gst=c.gst,
-            r=c.r,
-            memory_size=c.memory_size,
-            speaker_embedding_dim=55,
-        ).to(
-            device
-        )  # FIXME: missing num_speakers parameter to Tacotron ctor
+        config.d_vector_dim = 55
+        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -316,15 +271,15 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
-        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        optimizer = optim.Adam(model.parameters(), lr=config.lr)
         for _ in range(5):
-            mel_out, linear_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            outputs = model.forward(
+                input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
             )
             optimizer.zero_grad()
-            loss = criterion(mel_out, mel_spec, mel_lengths)
-            stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
+            loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
+            stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
+            loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py
index 34ee6e06..0c35ee28 100644
--- a/tests/tts_tests/test_tacotron_train.py
+++ b/tests/tts_tests/test_tacotron_train.py
@@ -13,7 +13,7 @@ config = TacotronConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=False,
     phoneme_language="en-us",
@@ -22,7 +22,12 @@ config = TacotronConfig(
     test_delay_epochs=-1,
     epochs=1,
     print_step=1,
+    test_sentences=[
+        "Be a voice, not an echo.",
+    ],
     print_eval=True,
+    r=5,
+    max_decoder_steps=50,
 )
 config.audio.do_trim_silence = True
 config.audio.trim_db = 60
@@ -30,12 +35,13 @@ config.save_json(config_path)

 # train the model for one epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --config_path {config_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
     f"--coqpit.output_path {output_path} "
     "--coqpit.datasets.0.name ljspeech "
     "--coqpit.datasets.0.meta_file_train metadata.csv "
     "--coqpit.datasets.0.meta_file_val metadata.csv "
     "--coqpit.datasets.0.path tests/data/ljspeech "
+    "--coqpit.test_delay_epochs 0"
 )
 run_cli(command_train)

@@ -43,8 +49,6 @@ run_cli(command_train)
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tacotron.py --continue_path {continue_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_fullband_melgan_train.py b/tests/vocoder_tests/test_fullband_melgan_train.py
index 2b286b91..9d4e1933 100644
--- a/tests/vocoder_tests/test_fullband_melgan_train.py
+++ b/tests/vocoder_tests/test_fullband_melgan_train.py
@@ -12,7 +12,7 @@ config = FullbandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -20,8 +20,8 @@ config = FullbandMelganConfig(
     eval_split_size=1,
     print_step=1,
     print_eval=True,
-    discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
     data_path="tests/data/ljspeech",
+    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
     output_path=output_path,
 )
 config.audio.do_trim_silence = True
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_hifigan_train.py b/tests/vocoder_tests/test_hifigan_train.py
index 11057570..c506fb48 100644
--- a/tests/vocoder_tests/test_hifigan_train.py
+++ b/tests/vocoder_tests/test_hifigan_train.py
@@ -13,7 +13,7 @@ config = HifiganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_melgan_train.py b/tests/vocoder_tests/test_melgan_train.py
index 3ff65b5a..6ef9cd49 100644
--- a/tests/vocoder_tests/test_melgan_train.py
+++ b/tests/vocoder_tests/test_melgan_train.py
@@ -12,14 +12,14 @@ config = MelganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
     seq_len=2048,
     eval_split_size=1,
     print_step=1,
-    discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
+    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
     print_eval=True,
     data_path="tests/data/ljspeech",
     output_path=output_path,
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py
index ef362414..c49107bd 100644
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@@ -12,7 +12,7 @@ config = MultibandMelganConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -20,8 +20,8 @@ config = MultibandMelganConfig(
     eval_split_size=1,
     print_step=1,
     print_eval=True,
-    discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]},
     data_path="tests/data/ljspeech",
+    discriminator_model_params={"base_channels": 16, "max_channels": 64, "downsample_factors": [4, 4, 4]},
     output_path=output_path,
 )
 config.audio.do_trim_silence = True
@@ -29,9 +29,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -39,7 +37,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_parallel_wavegan_train.py b/tests/vocoder_tests/test_parallel_wavegan_train.py
index fb6ea87c..a126befe 100644
--- a/tests/vocoder_tests/test_parallel_wavegan_train.py
+++ b/tests/vocoder_tests/test_parallel_wavegan_train.py
@@ -12,7 +12,7 @@ config = ParallelWaveganConfig(
     batch_size=4,
     eval_batch_size=4,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +28,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -38,7 +36,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_gan.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_vocoder_wavernn.py b/tests/vocoder_tests/test_vocoder_wavernn.py
index 9c58fa1c..b5c769ee 100644
--- a/tests/vocoder_tests/test_vocoder_wavernn.py
+++ b/tests/vocoder_tests/test_vocoder_wavernn.py
@@ -3,11 +3,13 @@ import random
 import numpy as np
 import torch

-from TTS.vocoder.models.wavernn import WaveRNN
+from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import Wavernn, WavernnArgs


 def test_wavernn():
-    model = WaveRNN(
+    config = WavernnConfig()
+    config.model_args = WavernnArgs(
         rnn_dims=512,
         fc_dims=512,
         mode=10,
@@ -20,14 +22,30 @@ def test_wavernn():
         compute_dims=128,
         res_out_dims=128,
         num_res_blocks=10,
-        hop_length=256,
-        sample_rate=22050,
     )
+    config.audio.hop_length = 256
+    config.audio.sample_rate = 2048
+
     dummy_x = torch.rand((2, 1280))
     dummy_m = torch.rand((2, 80, 9))
     y_size = random.randrange(20, 60)
     dummy_y = torch.rand((80, y_size))
+
+    # mode: mold
+    model = Wavernn(config)
     output = model(dummy_x, dummy_m)
-    assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape
+    assert np.all(output.shape == (2, 1280, 30)), output.shape
+
+    # mode: gauss
+    config.model_params.mode = "gauss"
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2)), output.shape
+
+    # mode: quantized
+    config.model_params.mode = 4
+    model = Wavernn(config)
+    output = model(dummy_x, dummy_m)
+    assert np.all(output.shape == (2, 1280, 2 ** 4)), output.shape
     output = model.inference(dummy_y, True, 5500, 550)
     assert np.all(output.shape == (256 * (y_size - 1),))
diff --git a/tests/vocoder_tests/test_wavegrad.py b/tests/vocoder_tests/test_wavegrad.py
index a28409e5..43b5f080 100644
--- a/tests/vocoder_tests/test_wavegrad.py
+++ b/tests/vocoder_tests/test_wavegrad.py
@@ -4,7 +4,8 @@ import numpy as np
 import torch
 from torch import optim

-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.configs import WavegradConfig
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs

 # pylint: disable=unused-variable

@@ -20,19 +21,16 @@ class WavegradTrainTest(unittest.TestCase):
         mel_spec = torch.rand(8, 80, 20).to(device)

         criterion = torch.nn.L1Loss().to(device)
-        model = Wavegrad(
+        args = WavegradArgs(
             in_channels=80,
             out_channels=1,
             upsample_factors=[5, 5, 3, 2, 2],
             upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
         )
+        config = WavegradConfig(model_params=args)
+        model = Wavegrad(config)

-        model_ref = Wavegrad(
-            in_channels=80,
-            out_channels=1,
-            upsample_factors=[5, 5, 3, 2, 2],
-            upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-        )
+        model_ref = Wavegrad(config)
         model.train()
         model.to(device)
         betas = np.linspace(1e-6, 1e-2, 1000)
diff --git a/tests/vocoder_tests/test_wavegrad_layers.py b/tests/vocoder_tests/test_wavegrad_layers.py
index 0180eb0a..a0b021dc 100644
--- a/tests/vocoder_tests/test_wavegrad_layers.py
+++ b/tests/vocoder_tests/test_wavegrad_layers.py
@@ -1,7 +1,8 @@
 import torch

+from TTS.vocoder.configs import WavegradConfig
 from TTS.vocoder.layers.wavegrad import DBlock, FiLM, PositionalEncoding, UBlock
-from TTS.vocoder.models.wavegrad import Wavegrad
+from TTS.vocoder.models.wavegrad import Wavegrad, WavegradArgs


 def test_positional_encoding():
@@ -75,12 +76,14 @@ def test_wavegrad_forward():
     c = torch.rand(32, 80, 20)
     noise_scale = torch.rand(32)

-    model = Wavegrad(
+    args = WavegradArgs(
         in_channels=80,
         out_channels=1,
         upsample_factors=[5, 5, 3, 2, 2],
         upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
     )
+    config = WavegradConfig(model_params=args)
+    model = Wavegrad(config)
     o = model.forward(x, c, noise_scale)

     assert o.shape[0] == 32
diff --git a/tests/vocoder_tests/test_wavegrad_train.py b/tests/vocoder_tests/test_wavegrad_train.py
index e222de3a..fe56ee78 100644
--- a/tests/vocoder_tests/test_wavegrad_train.py
+++ b/tests/vocoder_tests/test_wavegrad_train.py
@@ -12,7 +12,7 @@ config = WavegradConfig(
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -29,15 +29,15 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

 # restore the model and continue training for one more epoch
-command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavegrad.py --continue_path {continue_path} "
+command_train = (
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
+)
 run_cli(command_train)
 shutil.rmtree(continue_path)
diff --git a/tests/vocoder_tests/test_wavernn_train.py b/tests/vocoder_tests/test_wavernn_train.py
index 414ed719..43fc5fb1 100644
--- a/tests/vocoder_tests/test_wavernn_train.py
+++ b/tests/vocoder_tests/test_wavernn_train.py
@@ -4,15 +4,18 @@ import shutil

 from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.vocoder.configs import WavernnConfig
+from TTS.vocoder.models.wavernn import WavernnArgs

 config_path = os.path.join(get_tests_output_path(), "test_vocoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

+
 config = WavernnConfig(
+    model_params=WavernnArgs(),
     batch_size=8,
     eval_batch_size=8,
     num_loader_workers=0,
-    num_val_loader_workers=0,
+    num_eval_loader_workers=0,
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -28,9 +31,7 @@ config.audio.trim_db = 60
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --config_path {config_path} "
-)
+command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --config_path {config_path} "
 run_cli(command_train)

 # Find latest folder
@@ -38,7 +39,7 @@ continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getm

 # restore the model and continue training for one more epoch
 command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder_wavernn.py --continue_path {continue_path} "
+    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_vocoder.py --continue_path {continue_path} "
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)