diff --git a/.compute b/.compute
index 4df52e0f..65a91011 100644
--- a/.compute
+++ b/.compute
@@ -1,7 +1,16 @@
#!/bin/bash
-ls ${SHARED_DIR}/data/keithito
-pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
+yes | apt-get install sox
+yes | apt-get install ffmpeg
yes | apt-get install espeak
+yes | apt-get install tmux
+yes | apt-get install zsh
+pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl
+# wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar
+wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
+sudo sh install.sh
python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
-python3 train.py --config_path config.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --output_path ../keep/ --restore_path ${USER_DIR}/best_model_by_tilman.pth.tar
\ No newline at end of file
+# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
+cp -R ${USER_DIR}/GermanData ../tmp/
+python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/GermanData/karlsson/
+# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/
+while true; do sleep 1000000; done
diff --git a/.gitignore b/.gitignore
index bfc8ea1a..b0fe0bee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.idea/
*.pyc
.DS_Store
./__init__.py
diff --git a/README.md b/README.md
index 2e47f24d..b8d82408 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,14 @@
This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. To begin with, you can hear a sample generated voice from [here](https://soundcloud.com/user-565970875/commonvoice-loc-sens-attn).
-The model architecture is highly inspired by Tacotron: [A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/abs/1703.10135). However, it has many important updates that make training faster and computationally very efficient. Feel free to experiment with new ideas and propose changes.
+TTS includes two different model implementations which are based on [Tacotron](https://arxiv.org/abs/1703.10135) and [Tacotron2](https://arxiv.org/abs/1712.05884). Tacotron is smaller, more efficient, and easier to train, but Tacotron2 provides better results, especially when it is combined with a neural vocoder. Therefore, choose depending on your project requirements.
-You can find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief note about TTS architectures and their comparisons.
+If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
+
+## TTS Performance
+
+
+[Details...](https://github.com/mozilla/TTS/issues/186)
## Requirements and Installation
Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
@@ -38,41 +43,43 @@ Check out [here](https://mycroft.ai/blog/available-voices/#the-human-voice-is-th
| Models |Dataset | Commit | Audio Sample | Details |
| ------------- |:------:|:-----------------:|:--------------|:--------|
-| [iter-62410](https://drive.google.com/open?id=1pjJNzENL3ZNps9n7k_ktGbpEl6YPIkcZ)|LJSpeech| [99d56f7](https://github.com/mozilla/TTS/tree/99d56f7e93ccd7567beb0af8fcbd4d24c48e59e9) | [link](https://soundcloud.com/user-565970875/99d56f7-iter62410 )|First model with plain Tacotron implementation.|
-| [iter-170K](https://drive.google.com/open?id=16L6JbPXj6MSlNUxEStNn28GiSzi4fu1j) |LJSpeech| [e00bc66](https://github.com/mozilla/TTS/tree/e00bc66) |[link](https://soundcloud.com/user-565970875/april-13-2018-07-06pm-e00bc66-iter170k)|More stable and longer trained model.|
-| [iter-270K](https://drive.google.com/drive/folders/1Q6BKeEkZyxSGsocK2p_mqgzLwlNvbHFJ?usp=sharing)|LJSpeech|[256ed63](https://github.com/mozilla/TTS/tree/256ed63)|[link](https://soundcloud.com/user-565970875/sets/samples-1650226)|Stop-Token prediction is added, to detect end of speech.|
-| [iter-120K](https://drive.google.com/open?id=1A5Hr6aSvfGgIiE20mBkpzyn3vvbR2APj) |LJSpeech| [bf7590](https://github.com/mozilla/TTS/tree/bf7590) | [link](https://soundcloud.com/user-565970875/sets/september-26-2018-bf7590) | Better for longer sentences |
-|[iter-108K](https://drive.google.com/open?id=1deQ2akq9cuyreda0DgZOiBdydkbgseWP)| TWEB | [2810d57](https://github.com/mozilla/TTS/tree/2810d57) | [link](https://soundcloud.com/user-565970875/tweb-example-108k-iters-2810d57) | https://github.com/mozilla/TTS/issues/22 |
-| Best: [iter-185K](https://drive.google.com/drive/folders/1GU8WGix98WrR3ayjoiirmmbLUZzwg4n0?usp=sharing) | LJSpeech | [db7f3d3](https://github.com/mozilla/TTS/tree/db7f3d3) | [link](https://soundcloud.com/user-565970875/sets/ljspeech-model-185k-iters-commit-db7f3d3) | [link](https://github.com/mozilla/TTS/issues/108) |
+| [Tacotron-iter-62410](https://drive.google.com/open?id=1pjJNzENL3ZNps9n7k_ktGbpEl6YPIkcZ)|LJSpeech| [99d56f7](https://github.com/mozilla/TTS/tree/99d56f7e93ccd7567beb0af8fcbd4d24c48e59e9) | [link](https://soundcloud.com/user-565970875/99d56f7-iter62410 )|First model with plain Tacotron implementation.|
+| [Tacotron-iter-170K](https://drive.google.com/open?id=16L6JbPXj6MSlNUxEStNn28GiSzi4fu1j) |LJSpeech| [e00bc66](https://github.com/mozilla/TTS/tree/e00bc66) |[link](https://soundcloud.com/user-565970875/april-13-2018-07-06pm-e00bc66-iter170k)|More stable and longer trained model.|
+| [Tacotron-iter-270K](https://drive.google.com/drive/folders/1Q6BKeEkZyxSGsocK2p_mqgzLwlNvbHFJ?usp=sharing)|LJSpeech|[256ed63](https://github.com/mozilla/TTS/tree/256ed63)|[link](https://soundcloud.com/user-565970875/sets/samples-1650226)|Stop-Token prediction is added, to detect end of speech.|
+| [Tacotron-iter-120K](https://drive.google.com/open?id=1A5Hr6aSvfGgIiE20mBkpzyn3vvbR2APj) |LJSpeech| [bf7590](https://github.com/mozilla/TTS/tree/bf7590) | [link](https://soundcloud.com/user-565970875/sets/september-26-2018-bf7590) | Better for longer sentences |
+|[Tacotron-iter-108K](https://drive.google.com/open?id=1deQ2akq9cuyreda0DgZOiBdydkbgseWP)| TWEB | [2810d57](https://github.com/mozilla/TTS/tree/2810d57) | [link](https://soundcloud.com/user-565970875/tweb-example-108k-iters-2810d57) | https://github.com/mozilla/TTS/issues/22 |
+|[Tacotron-iter-185K](https://drive.google.com/drive/folders/1GU8WGix98WrR3ayjoiirmmbLUZzwg4n0?usp=sharing) | LJSpeech | [db7f3d3](https://github.com/mozilla/TTS/tree/db7f3d3) | [link](https://soundcloud.com/user-565970875/sets/ljspeech-model-185k-iters-commit-db7f3d3) | [link](https://github.com/mozilla/TTS/issues/108) |
+|[Tacotron2-iter-260K](https://drive.google.com/open?id=1FJRjGDAqWIyZRX4CsppaIPEW8UWXCWzF)|LJSpeech|[824c091](https://github.com/mozilla/TTS/tree/824c091)|[soundcloud](https://soundcloud.com/user-565970875/ljspeech-logistic-wavernn)|[link](https://github.com/mozilla/TTS/issues/153)|
## Example Model Outputs
-Below you see model state after 16K iterations with batch-size 32.
+Below you see Tacotron model state after 16K iterations with batch-size 32 with LJSpeech dataset.
> "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning."
-Audio output: [https://soundcloud.com/user-565970875/iter16k-f48c3b](https://soundcloud.com/user-565970875/iter16k-f48c3b)
+Audio examples: [https://soundcloud.com/user-565970875](https://soundcloud.com/user-565970875)

## Runtime
-The most time-consuming part is the vocoder algorithm (Griffin-Lim) which runs on CPU. By setting its number of iterations, you might have faster execution with a small loss of quality. Some of the experimental values are below.
+The most time-consuming part is the vocoder algorithm (Griffin-Lim) which runs on CPU. By setting its number of iterations lower, you might have faster execution with a small loss of quality. Some of the experimental values are below.
Sentence: "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
Audio length is approximately 6 secs.
-| Time (secs) | System | # GL iters |
-| ---- |:-------|:-----------|
-|2.00|GTX1080Ti|30|
-|3.01|GTX1080Ti|60|
+| Time (secs) | System | # GL iters | Model |
+| ---- |:-------|:-----------| ---- |
+|2.00|GTX1080Ti|30|Tacotron|
+|3.01|GTX1080Ti|60|Tacotron|
+|3.57|CPU|60|Tacotron|
+|5.27|GTX1080Ti|60|Tacotron2|
+|6.50|CPU|60|Tacotron2|
## Datasets and Data-Loading
-TTS provides a generic dataloder easy to use for new datasets. You need to write an adaptor to format and that's all you need.Check ```datasets/preprocess.py``` to see example adaptors. After you wrote an adaptor, you need to set ```dataset``` field in ```config.json```. Do not forget other data related fields.
+TTS provides a generic dataloader that is easy to use for new datasets. You need to write a preprocessor function to integrate your own dataset. Check ```datasets/preprocess.py``` to see some examples. After writing the function, you need to set the ```dataset``` field in ```config.json```. Do not forget the other data-related fields too.
-You can also use pre-computed features. In this case, compute features with ```extract_features.py``` and set ```dataset``` field as ```tts_cache```.
-
-Example datasets, we successfully applied TTS, are linked below.
+Some of the open-source datasets to which we have successfully applied TTS are linked below.
- [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
- [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
@@ -80,9 +87,9 @@ Example datasets, we successfully applied TTS, are linked below.
- [M-AI-Labs](http://www.caito.de/2019/01/the-m-ailabs-speech-dataset/)
## Training and Fine-tuning LJ-Speech
-[Click Here](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) for hands-on **Notebook example**, training LJSpeech.
+Here you can find a [CoLab](https://gist.github.com/erogol/97516ad65b44dbddb8cd694953187c5b) notebook for a hands-on example, training LJSpeech. Or you can manually follow the guideline below.
-Split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that having a validation split does not work well as oppose to other ML problems since at the validation time model generates spectrogram slices without "Teacher-Forcing" and that leads misalignment between the ground-truth and the prediction. Therefore, validation loss does not really show the model performance. Rather, you might use all data for training and check the model performance by relying on human inspection.
+To start with, split ```metadata.csv``` into train and validation subsets, ```metadata_train.csv``` and ```metadata_val.csv``` respectively. Note that for text-to-speech, validation performance might be misleading since the loss value does not directly measure the voice quality to the human ear and it also does not measure the attention module performance. Therefore, running the model with new sentences and listening to the results is the best way to go.
```
shuf metadata.csv > metadata_shuf.csv
@@ -90,7 +97,7 @@ head -n 12000 metadata_shuf.csv > metadata_train.csv
tail -n 1100 metadata_shuf.csv > metadata_val.csv
```
-To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below.
+To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in ```config.json```.
```train.py --config_path config.json```
@@ -106,14 +113,20 @@ Each run creates a new output folder and ```config.json``` is copied under this
In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed.
-You can also enjoy Tensorboard, if you point the Tensorboard argument```--logdir``` to the experiment folder.
+You can also enjoy Tensorboard, if you point the Tensorboard argument ```--logdir``` to the experiment folder.
## Testing
Best way to test your network is to use Notebooks under ```notebooks``` folder.
-## What is new with TTS
-If you train TTS with LJSpeech dataset, you start to hear reasonable results after 12.5K iterations with batch size 32. This is the fastest training with character-based methods up to our knowledge. Out implementation is also quite robust against long sentences.
+## Contact/Getting Help
+- [Wiki](https://github.com/mozilla/TTS/wiki)
+- [Discourse Forums](https://discourse.mozilla.org/c/tts) - If your question is not addressed in the Wiki, the Discourse Forums is the next place to look. They contain conversations on General Topics, Using TTS, and TTS Development.
+
+- [Issues](https://github.com/mozilla/TTS/issues) - Finally, if all else fails, you can open an issue in our repo.
+
+ r=1) by only changing the last layer. For instance, you can train the model with r=5 and then fine-tune it with r=1 without any performance loss. It also solves well-known PreNet problem [#50](https://github.com/mozilla/TTS/issues/50).
+- Constant history window. Instead of using only the last frame of predictions, define a constant history queue. It enables training with gradually decreasing prediction frame (r=5 -> r=1) by only changing the last layer. For instance, you can train the model with r=5 and then fine-tune it with r=1 without any performance loss. It also solves well-known PreNet problem [#50](https://github.com/mozilla/TTS/issues/50).
- Initialization of hidden decoder states with Embedding layers instead of zero initialization.
-
One common question is to ask why we don't use Tacotron2 architecture. According to our ablation experiments, nothing, except Location Sensitive Attention, improves the performance, given the increase in the model size.
-
Please feel free to offer new changes and pull things off. We are happy to discuss and make things better.
-
-## Problems waiting to be solved.
-- Punctuations at the end of a sentence sometimes affect the pronunciation of the last word. Because punctuation sign is attended by the attention module, that forces the network to create a voice signal or at least modify the voice signal being generated for neighboring frames.
-- ~~Simpler stop-token prediction. Right now we use RNN to keep the history of the previous frames. However, we never tested, if something simpler would work as well.~~ Yet RNN based model gives more stable predictions.
-- Train for better mel-specs. Mel-spectrograms are not good enough to be fed Neural Vocoder. Easy solution to this problem is to train the model with r=1. However, in this case, model struggles to align the attention.
-- irregular words: "minute", "focus", "aren't" etc. Even though ~~it might be solved~~ (Use a better dataset like Nancy or train phonemes enabled.)
+-->
## Major TODOs
- [x] Implement the model.
- [x] Generate human-like speech on LJSpeech dataset.
- [x] Generate human-like speech on a different dataset (Nancy) (TWEB).
- [x] Train TTS with r=1 successfully.
-- [x] Enable process based distributed training. Similar [to] (https://github.com/fastai/imagenet-fast/).
-- [ ] Adapting Neural Vocoder. The most active work is [here] (https://github.com/erogol/WaveRNN)
+- [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/).
+- [x] Adapting Neural Vocoder. TTS works with (https://github.com/erogol/WaveRNN)
- [ ] Multi-speaker embedding.
+- [ ] Model optimization (model export, pruning etc.)
-## References
+
### Precursor implementations
- https://github.com/keithito/tacotron (Dataset and Test processing)
diff --git a/config.json b/config.json
index d07ea7d0..c2efcbe3 100644
--- a/config.json
+++ b/config.json
@@ -1,12 +1,12 @@
{
- "model_name": "queue",
- "model_description": "Queue memory and change lower r incrementatlly",
+ "run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
+ "run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
- "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
+ "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": 50, // stft window length in ms.
"frame_shift_ms": 12.5, // stft window hop-lengh in ms.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
@@ -19,8 +19,8 @@
"symmetric_norm": false, // move normalization to range [-1, 1]
"max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
- "mel_fmin": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": null, // maximum freq level for mel-spec. Tune for dataset!!
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
},
@@ -29,38 +29,52 @@
"url": "tcp:\/\/localhost:54321"
},
- "embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
- "text_cleaner": "phoneme_cleaners",
- "epochs": 1000, // total number of epochs to train.
- "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
- "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
- "loss_weight": 0.0, // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
- "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
- "windowing": false, // Enables attention windowing. Used only in eval mode.
- "memory_size": 5, // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+ "reinit_layers": [],
- "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
- "eval_batch_size":32,
- "r": 5, // Number of frames to predict for step.
- "wd": 0.000001, // Weight decay weight.
- "checkpoint": true, // If true, it saves checkpoints per "save_step"
- "save_step": 5000, // Number of training steps expected to save traning stats and checkpoints.
- "print_step": 50, // Number of steps to log traning on console.
+ "model": "Tacotron2", // one of the model in models/
+ "grad_clip": 1, // upper limit for gradients for clipping.
+ "epochs": 1000, // total number of epochs to train.
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+ "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+ "windowing": false, // Enables attention windowing. Used only in eval mode.
+ "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
+ "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
+ "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
+ "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+ "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
+ "location_attn": false, // ONLY TACOTRON2 - enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+ "loss_masking": true, // enable / disable loss masking against the sequence padding.
+ "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+ "stopnet": true, // Train stopnet predicting the end of synthesis.
+ "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It yields a better model, but it trains SLOWER.
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
- "batch_group_size": 8, //Number of batches to shuffle after bucketing.
+
+ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
+ "eval_batch_size":16,
+ "r": 1, // Number of frames to predict for step.
+ "wd": 0.000001, // Weight decay weight.
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
+ "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
+ "print_step": 10, // Number of steps to log training on console.
+ "batch_group_size": 0, //Number of batches to shuffle after bucketing.
"run_eval": true,
- "test_delay_epochs": 100, //Until attention is aligned, testing only wastes computation time.
- "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument
- "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
- "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
- "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+ "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
+ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+ "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument
+ "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
+ "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
+ "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
- "max_seq_len": 300, // DATASET-RELATED: maximum text length
- "output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/", // DATASET-RELATED: output path for all training outputs.
- "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+ "max_seq_len": 150, // DATASET-RELATED: maximum text length
+ "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
+ "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
- "phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
+ "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
- "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+ "text_cleaner": "phoneme_cleaners"
}
+
diff --git a/config_cluster.json b/config_cluster.json
index 073923be..2c05ca41 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -1,65 +1,80 @@
{
- "model_name": "tts-master",
- "model_description": "tts master with symbols update",
-
- "audio":{
- "audio_processor": "audio", // to use dictate different audio processors, if available.
- // Audio processing parameters
- "num_mels": 80, // size of the mel spec frame.
- "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
- "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
- "frame_length_ms": 50, // stft window length in ms.
- "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
- "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
- "min_level_db": -100, // normalization range
- "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
- "power": 1.5, // value to sharpen wav signals after GL algorithm.
- "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
- // Normalization parameters
- "signal_norm": true, // normalize the spec values in range [0, 1]
- "symmetric_norm": false, // move normalization to range [-1, 1]
- "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
- "clip_norm": true, // clip normalized values into the range.
- "mel_fmin": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": null, // maximum freq level for mel-spec. Tune for dataset!!
- "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
- },
-
- "distributed":{
- "backend": "nccl",
- "url": "tcp:\/\/localhost:54321"
- },
-
- "embedding_size": 256, // Character embedding vector length. You don't need to change it in general.
- "text_cleaner": "phoneme_cleaners",
- "epochs": 1000, // total number of epochs to train.
- "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
- "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
- "loss_weight": 0.0, // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
- "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
- "windowing": false, // Enables attention windowing. Used only in eval mode.
- "memory_size": 5, // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
-
- "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
- "eval_batch_size":32,
- "r": 5, // Number of frames to predict for step.
- "wd": 0.00001, // Weight decay weight.
- "checkpoint": true, // If true, it saves checkpoints per "save_step"
- "save_step": 5000, // Number of training steps expected to save traning stats and checkpoints.
- "print_step": 50, // Number of steps to log traning on console.
- "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
- "run_eval": true,
- "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument
- "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader.
- "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader.
- "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
- "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
- "max_seq_len": 300, // DATASET-RELATED: maximum text length
- "output_path": "models/", // DATASET-RELATED: output path for all training outputs.
- "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
- "num_val_loader_workers": 4, // number of evaluation data loader processes.
- "phoneme_cache_path": "phonemes_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
- "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
- "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
-}
+ "run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking",
+ "run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP",
+
+ "audio":{
+ // Audio processing parameters
+ "num_mels": 80, // size of the mel spec frame.
+ "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame.
+ "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+ "frame_length_ms": 50, // stft window length in ms.
+ "frame_shift_ms": 12.5, // stft window hop-lengh in ms.
+ "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+ "min_level_db": -100, // normalization range
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+ "power": 1.5, // value to sharpen wav signals after GL algorithm.
+ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+ // Normalization parameters
+ "signal_norm": true, // normalize the spec values in range [0, 1]
+ "symmetric_norm": false, // move normalization to range [-1, 1]
+ "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+ "clip_norm": true, // clip normalized values into the range.
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+ "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+ },
+
+ "distributed":{
+ "backend": "nccl",
+ "url": "tcp:\/\/localhost:54321"
+ },
+
+ "reinit_layers": [],
+
+ "model": "Tacotron2", // one of the model in models/
+ "grad_clip": 1, // upper limit for gradients for clipping.
+ "epochs": 1000, // total number of epochs to train.
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+ "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+ "windowing": false, // Enables attention windowing. Used only in eval mode.
+ "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
+ "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
+ "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
+ "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+ "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
+ "location_attn": false, // ONLY TACOTRON2 - enable/disable location sensitive attention. It is enabled for TACOTRON by default.
+ "loss_masking": true, // enable / disable loss masking against the sequence padding.
+ "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+ "stopnet": true, // Train stopnet predicting the end of synthesis.
+ "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents the stopnet loss from influencing the rest of the model. It yields a better model, but it trains SLOWER.
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
+ "eval_batch_size":16,
+ "r": 1, // Number of frames to predict for step.
+ "wd": 0.000001, // Weight decay weight.
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
+ "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
+ "print_step": 10, // Number of steps to log training on console.
+ "batch_group_size": 0, //Number of batches to shuffle after bucketing.
+
+ "run_eval": true,
+ "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
+ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+ "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument
+ "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
+ "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
+ "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
+ "max_seq_len": 150, // DATASET-RELATED: maximum text length
+ "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
+ "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+ "num_val_loader_workers": 4, // number of evaluation data loader processes.
+ "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
+ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
+ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+ "text_cleaner": "phoneme_cleaners"
+ }
+
diff --git a/config_tacotron.json b/config_tacotron.json
new file mode 100644
index 00000000..968eae1e
--- /dev/null
+++ b/config_tacotron.json
@@ -0,0 +1,79 @@
+{
+ "run_name": "mozilla-tacotron-tagent",
+ "run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. Compare this with 4841",
+
+ "audio":{
+ // Audio processing parameters
+ "num_mels": 80, // size of the mel spec frame.
+ "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
+ "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+ "frame_length_ms": 50, // stft window length in ms.
+ "frame_shift_ms": 12.5, // stft window hop-length in ms.
+ "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+ "min_level_db": -100, // normalization range
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+ "power": 1.5, // value to sharpen wav signals after GL algorithm.
+ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+ // Normalization parameters
+ "signal_norm": true, // normalize the spec values in range [0, 1]
+ "symmetric_norm": false, // move normalization to range [-1, 1]
+ "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+ "clip_norm": true, // clip normalized values into the range.
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+ "do_trim_silence": true // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+ },
+
+ "distributed":{
+ "backend": "nccl",
+ "url": "tcp:\/\/localhost:54321"
+ },
+
+ "reinit_layers": [],
+
+ "model": "Tacotron", // one of the model in models/
+ "grad_clip": 1, // upper limit for gradients for clipping.
+ "epochs": 1000, // total number of epochs to train.
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+ "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+ "windowing": false, // Enables attention windowing. Used only in eval mode.
+ "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
+ "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
+ "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
+ "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+ "transition_agent": true, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
+ "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+ "loss_masking": true, // enable / disable loss masking against the sequence padding.
+ "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+ "stopnet": true, // Train stopnet predicting the end of synthesis.
+ "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
+ "eval_batch_size":16,
+ "r": 5, // Number of frames to predict for step.
+ "wd": 0.000001, // Weight decay weight.
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
+ "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
+ "print_step": 10, // Number of steps to log training on console.
+ "batch_group_size": 0, //Number of batches to shuffle after bucketing.
+
+ "run_eval": true,
+ "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
+ "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from command argument
+ "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
+ "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
+ "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
+ "max_seq_len": 150, // DATASET-RELATED: maximum text length
+ "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
+ "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+ "num_val_loader_workers": 4, // number of evaluation data loader processes.
+ "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder.
+ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+ "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+ "text_cleaner": "phoneme_cleaners"
+ }
+
\ No newline at end of file
diff --git a/config_tacotron_de.json b/config_tacotron_de.json
new file mode 100644
index 00000000..7f221c64
--- /dev/null
+++ b/config_tacotron_de.json
@@ -0,0 +1,94 @@
+{
+ "run_name": "german-tacotron-tagent",
+ "run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. First run German data.",
+
+ "audio":{
+ // Audio processing parameters
+ "num_mels": 80, // size of the mel spec frame.
+ "num_freq": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.
+ "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+ "frame_length_ms": 50, // stft window length in ms.
+ "frame_shift_ms": 12.5, // stft window hop-length in ms.
+ "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+ "min_level_db": -100, // normalization range
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
+ "power": 1.5, // value to sharpen wav signals after GL algorithm.
+ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+ // Normalization parameters
+ "signal_norm": true, // normalize the spec values in range [0, 1]
+ "symmetric_norm": false, // move normalization to range [-1, 1]
+ "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+ "clip_norm": true, // clip normalized values into the range.
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
+ "do_trim_silence": true // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+ },
+
+ "distributed":{
+ "backend": "nccl",
+ "url": "tcp:\/\/localhost:54321"
+ },
+
+ "reinit_layers": [],
+
+ "model": "Tacotron", // one of the model in models/
+ "grad_clip": 1, // upper limit for gradients for clipping.
+ "epochs": 1000, // total number of epochs to train.
+ "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
+ "lr_decay": false, // if true, Noam learning rate decaying is applied through training.
+ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
+ "windowing": false, // Enables attention windowing. Used only in eval mode.
+ "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
+ "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
+ "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
+ "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
+ "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
+ "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+ "loss_masking": true, // enable / disable loss masking against the sequence padding.
+ "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+ "stopnet": true, // Train stopnet predicting the end of synthesis.
+ "separate_stopnet": true, // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss from influencing the rest of the model. It causes a better model, but it trains SLOWER.
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.
+ "eval_batch_size":16,
+ "r": 5, // Number of frames to predict for step.
+ "wd": 0.000001, // Weight decay weight.
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
+ "save_step": 1000, // Number of training steps expected to save training stats and checkpoints.
+ "print_step": 10, // Number of steps to log training on console.
+ "batch_group_size": 0, //Number of batches to shuffle after bucketing.
+
+ "run_eval": false,
+ "test_sentences_file": "de_sentences.txt", // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+ "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
+ "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from command argument
+ "meta_file_train": [
+ "kleinzaches/metadata.csv",
+ "spiegel_kaetzchen/metadata.csv",
+ "herrnarnesschatz/metadata.csv",
+ "maedchen_von_moorhof/metadata.csv",
+ "koenigsgaukler/metadata.csv",
+ "altehous/metadata.csv",
+ "odysseus/metadata.csv",
+ "undine/metadata.csv",
+ "reise_tilsit/metadata.csv",
+ "schmied_seines_glueckes/metadata.csv",
+ "kammmacher/metadata.csv",
+ "unterm_birnbaum/metadata.csv",
+ "liebesbriefe/metadata.csv",
+ "sandmann/metadata.csv"], // DATASET-RELATED: metafile for training dataloader.
+ "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
+ "dataset": "mailabs", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
+ "max_seq_len": 150, // DATASET-RELATED: maximum text length
+ "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
+ "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+ "num_val_loader_workers": 4, // number of evaluation data loader processes.
+ "phoneme_cache_path": "phoneme_cache", // phoneme computation is slow, therefore, it caches results in the given folder.
+ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
+ "phoneme_language": "de", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+ "text_cleaner": "phoneme_cleaners"
+ }
+
\ No newline at end of file
diff --git a/dataset_analysis/AnalyzeDataset.ipynb b/dataset_analysis/AnalyzeDataset.ipynb
index 784784ef..3ed54ded 100644
--- a/dataset_analysis/AnalyzeDataset.ipynb
+++ b/dataset_analysis/AnalyzeDataset.ipynb
@@ -2,142 +2,132 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "TTS_PATH = \"/home/erogol/projects/\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
+ "import sys\n",
+ "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
"import glob\n",
"import librosa\n",
"import numpy as np\n",
"import pandas as pd\n",
+ "from scipy.stats import norm\n",
"from tqdm import tqdm_notebook as tqdm\n",
"from multiprocessing import Pool\n",
"from matplotlib import pylab as plt\n",
"from collections import Counter\n",
+ "from TTS.datasets.preprocess import *\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "DATA_PATH = \"../../../Data/LJSpeech-1.1/wavs/\"\n",
- "META_PATH = \"../../../Data/LJSpeech-1.1/metadata.csv\"\n",
+ "DATA_PATH = \"/home/erogol/Data/m-ai-labs/de_DE/by_book/male/karlsson/\"\n",
+ "META_DATA = [\"kleinzaches/metadata.csv\",\n",
+ " \"spiegel_kaetzchen/metadata.csv\",\n",
+ " \"herrnarnesschatz/metadata.csv\",\n",
+ " \"maedchen_von_moorhof/metadata.csv\",\n",
+ " \"koenigsgaukler/metadata.csv\",\n",
+ " \"altehous/metadata.csv\",\n",
+ " \"odysseus/metadata.csv\",\n",
+ " \"undine/metadata.csv\",\n",
+ " \"reise_tilsit/metadata.csv\",\n",
+ " \"schmied_seines_glueckes/metadata.csv\",\n",
+ " \"kammmacher/metadata.csv\",\n",
+ " \"unterm_birnbaum/metadata.csv\",\n",
+ " \"liebesbriefe/metadata.csv\",\n",
+ " \"sandmann/metadata.csv\"]\n",
"NUM_PROC = 8"
]
},
{
"cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " > Number of audio files: 13100\n"
- ]
- }
- ],
- "source": [
- "file_names = glob.glob(os.path.join(DATA_PATH, \"*.wav\"))\n",
- "print(\" > Number of audio files: {}\".format(len(file_names)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "meta_f = open(META_PATH, 'r', encoding='utf8')\n",
- "meta = [m.split(\"|\") for m in meta_f.readlines()]"
+ "# use your own preprocessor at this stage - TTS/datasets/preprocess.py\n",
+ "items = mailabs(DATA_PATH, META_DATA)\n",
+ "print(\" > Number of audio files: {}\".format(len(items)))"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "f899c42f6f514ab9bf3834e5facef6a3",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
+ "outputs": [],
+ "source": [
+ "# check that every wav file listed in the metadata exists; print the paths that are missing\n",
+ "wav_files = []\n",
+ "for item in items:\n",
+ "    wav_file = item[1].strip()\n",
+ "    wav_files.append(wav_file)\n",
+ "    if not os.path.exists(wav_file):\n",
+ "        print(wav_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# show duplicate items\n",
+ "c = Counter(wav_files)\n",
+ "print([item for item, count in c.items() if count > 1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"def load_item(item):\n",
- " file_name = item[0]\n",
- " text = item[2]\n",
- " audio = librosa.load(os.path.join(DATA_PATH, file_name+'.wav'))\n",
+ " file_name = item[1].strip()\n",
+ " text = item[0].strip()\n",
+ " audio = librosa.load(file_name, sr=None)\n",
" sr = audio[1]\n",
" audio = audio[0]\n",
" audio_len = len(audio) / sr\n",
" text_len = len(text)\n",
- " return text, text_len, audio, audio_len\n",
+ " return file_name, text, text_len, audio, audio_len\n",
"\n",
"# This will take a while depending on size of dataset\n",
"if NUM_PROC == 1:\n",
" data = []\n",
- " for m in tqdm(meta):\n",
+ " for m in tqdm(items):\n",
" data += [load_item(m)]\n",
"else:\n",
" with Pool(8) as p:\n",
- " data = list(tqdm(p.imap(load_item, meta), total=len(meta)))"
+ " data = list(tqdm(p.imap(load_item, items), total=len(items)))"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "e42aca59abe14f8bb32b5d5f19af1c67",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- " > Number of words: 22943\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# count words in the dataset\n",
"w_count = Counter()\n",
"for item in tqdm(data):\n",
- " text = item[0].lower()\n",
+ " text = item[1].lower().strip()\n",
" for word in text.split():\n",
" w_count[word] += 1\n",
"print(\" > Number of words: {}\".format(len(w_count)))"
@@ -145,36 +135,14 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "647a2e1810324971aacb971acff91fb3",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(IntProgress(value=0, max=13100), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"text_vs_durs = {} # text length vs audio duration\n",
"text_len_counter = Counter() # number of sentences with the keyed length\n",
"for item in tqdm(data):\n",
- " text = item[0].lower()\n",
+ " text = item[1].lower().strip()\n",
" text_len = len(text)\n",
" text_len_counter[text_len] += 1\n",
" audio_len = item[-1]\n",
@@ -186,7 +154,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -200,6 +168,70 @@
" text_vs_std[key] = np.std(durs)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Avg audio length per char"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for item in data:\n",
+ " if item[-1] < 2:\n",
+ " print(item)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sec_per_chars = []\n",
+ "for item in data:\n",
+ " text = item[1]\n",
+ " dur = item[-1]\n",
+ " sec_per_char = dur / len(text)\n",
+ " sec_per_chars.append(sec_per_char)\n",
+ "# sec_per_char /= len(data)\n",
+ "# print(sec_per_char)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mean = np.mean(sec_per_chars)\n",
+ "std = np.std(sec_per_chars)\n",
+ "print(mean)\n",
+ "print(std)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dist = norm(mean, std)  # normal distribution fitted to the observed sec-per-char values\n",
+ "\n",
+ "# find irregular instances (too long or too short voice durations): density below that at 3 std from the mean\n",
+ "for item in data:\n",
+ "    text = item[1]\n",
+ "    dur = item[-1]\n",
+ "    sec_per_char = dur / len(text)\n",
+ "    pdf = dist.pdf(sec_per_char)  # use the fitted distribution, not the standard normal\n",
+ "    if pdf < dist.pdf(mean + 3 * std):\n",
+ "        print(item)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -209,30 +241,9 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"plt.title(\"text length vs mean audio duration\")\n",
"plt.scatter(list(text_vs_avg.keys()), list(text_vs_avg.values()))"
@@ -240,30 +251,9 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"plt.title(\"text length vs median audio duration\")\n",
"plt.scatter(list(text_vs_median.keys()), list(text_vs_median.values()))"
@@ -271,30 +261,9 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"plt.title(\"text length vs STD\")\n",
"plt.scatter(list(text_vs_std.keys()), list(text_vs_std.values()))"
@@ -302,30 +271,9 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"plt.title(\"text length vs # instances\")\n",
"plt.scatter(list(text_len_counter.keys()), list(text_len_counter.values()))"
@@ -340,7 +288,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -350,377 +298,20 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "