{
"run_name" : "multiband-melgan" ,
"run_description" : "multiband melgan mean-var scaling" ,
    // AUDIO PARAMETERS
    "audio": {
"fft_size" : 1024 , // number of stft frequency levels. Size of the linear spectogram frame.
"win_length" : 1024 , // stft window length in ms.
"hop_length" : 256 , // stft window hop-lengh in ms.
"frame_length_ms" : null , // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms" : null , // stft window hop-lengh in ms. If null, 'hop_length' is used.
        // Audio processing parameters
        "sample_rate": 22050, // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
        "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
"ref_level_db" : 20 , // reference level db, theoretically 20db is the sound of air.
"log_func" : "np.log10" ,
"do_sound_norm" : true ,
        // Silence trimming
"do_trim_silence" : false , // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db" : 60 , // threshold for timming silence. Set this according to your dataset.
// MelSpectrogram parameters
"num_mels" : 80 , // size of the mel spec frame.
"mel_fmin" : 50.0 , // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax" : 7600.0 , // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain" : 1.0 , // scaler value appplied after log transform of spectrogram.
        // Normalization parameters
        "signal_norm": true, // normalize spec values. Mean-var normalization if 'stats_path' is defined, otherwise range normalization using the params below.
        "min_level_db": -100, // lower bound for normalization
        "symmetric_norm": true, // move normalization into the range [-1, 1]
        "max_norm": 4.0, // scale normalization to the range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true, // clip normalized values into the range.
"stats_path" : null
    },
    // DISTRIBUTED TRAINING
    // "distributed":{
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },
    // MODEL PARAMETERS
    "use_pqmf": true,
    // LOSS PARAMETERS
    "use_stft_loss": true,
    "use_subband_stft_loss": true,
    "use_mse_gan_loss": true,
    "use_hinge_gan_loss": false,
    "use_feat_match_loss": false, // use only with melgan discriminators
"use_l1_spec_loss" : true ,
    // loss weights
    "stft_loss_weight": 0.5,
    "subband_stft_loss_weight": 0.5,
    "mse_G_loss_weight": 2.5,
    "hinge_G_loss_weight": 2.5,
    "feat_match_loss_weight": 25,
"l1_spec_loss_weight" : 2.5 ,
    // multiscale stft loss parameters
    "stft_loss_params": {
        "n_ffts": [1024, 2048, 512],
        "hop_lengths": [120, 240, 50],
        "win_lengths": [600, 1200, 240]
    },
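    // NOTE: the multiscale STFT loss is evaluated at each of the three resolutions
    // above; per resolution it typically combines a spectral-convergence term and a
    // log-STFT-magnitude term, following Parallel WaveGAN.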
    // subband multiscale stft loss parameters
    "subband_stft_loss_params": {
        "n_ffts": [384, 683, 171],
        "hop_lengths": [30, 60, 10],
        "win_lengths": [150, 300, 60]
    },
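    // NOTE: the subband STFT loss is computed on the PQMF subband signals, which run
    // at a quarter of the full sample rate; hence the hop and window lengths above are
    // roughly the full-band values divided by 4.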
"l1_spec_loss_params" : {
"use_mel" : true ,
"sample_rate" : 22050 ,
"n_fft" : 1024 ,
"hop_length" : 256 ,
"win_length" : 1024 ,
"n_mels" : 80 ,
"mel_fmin" : 0.0 ,
"mel_fmax" : null
} ,
"target_loss" : "G_avg_loss" , // loss value to pick the best model to save after each epoch
    // DISCRIMINATOR
    "discriminator_model": "melgan_multiscale_discriminator",
    "discriminator_model_params": {
        "base_channels": 16,
"max_channels" : 512 ,
"downsample_factors" : [ 4 , 4 , 4 ]
    },
    "steps_to_start_discriminator": 200000, // steps required before the discriminator starts training.
    // GENERATOR
    "generator_model": "multiband_melgan_generator",
    "generator_model_params": {
        "upsample_factors": [8, 4, 2],
        "num_res_blocks": 4
    },
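    // NOTE: the upsample factors multiply to 8 * 4 * 2 = 64; with 4 PQMF subbands this
    // gives 64 * 4 = 256 output samples per frame, matching 'hop_length' above.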
    // DATASET
"data_path" : "tests/data/ljspeech/wavs/" ,
"feature_path" : null ,
"seq_len" : 16384 ,
"pad_short" : 2000 ,
"conv_pad" : 0 ,
"use_noise_augment" : false ,
"use_cache" : true ,
"reinit_layers" : [ ] , // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
// TRAINING
"batch_size" : 4 , // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10, // Wait this many epochs before running tests; testing too early only wastes computation.
    "test_sentences_file": null, // set a file to load sentences to be used for testing. If null, default English sentences are used.
    // OPTIMIZER
"epochs" : 1 , // total number of epochs to train.
"wd" : 0.0 , // Weight decay weight.
"gen_clip_grad" : -1 , // Generator gradient clipping threshold. Apply gradient clipping if > 0
"disc_clip_grad" : -1 , // Discriminator gradient clipping threshold.
"optimizer" : "AdamW" ,
"optimizer_params" : {
"betas" : [ 0.8 , 0.99 ] ,
"weight_decay" : 0.0
} ,
"lr_scheduler_gen" : "MultiStepLR" , // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_gen_params" : {
"gamma" : 0.5 ,
"milestones" : [ 100000 , 200000 , 300000 , 400000 , 500000 , 600000 ]
} ,
"lr_scheduler_disc" : "MultiStepLR" , // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
"lr_scheduler_disc_params" : {
"gamma" : 0.5 ,
"milestones" : [ 100000 , 200000 , 300000 , 400000 , 500000 , 600000 ]
} ,
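    // NOTE: with gamma 0.5 and these milestones, both learning rates halve every 100k
    // steps, ending at 1e-4 * 0.5^6 ≈ 1.6e-6 after step 600k.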
"lr_gen" : 1e-4 , // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_disc" : 1e-4 ,
// TENSORBOARD and LOGGING
"print_step" : 1 , // Number of steps to log traning on console.
"print_eval" : false , // If True, it prints loss values for each step in eval run.
"save_step" : 25000 , // Number of training steps expected to plot training stats on TB and save model checkpoints.
"checkpoint" : true , // If true, it saves checkpoints per "save_step"
"keep_all_best" : true , // If true, keeps all best_models after keep_after steps
"keep_after" : 10000 , // Global step after which to keep best models if keep_all_best is true
"tb_model_param_stats" : false , // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
// DATA LOADING
"num_loader_workers" : 0 , // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_eval_loader_workers" : 0 , // number of evaluation data loader processes.
"eval_split_size" : 10 ,
// PATHS
"output_path" : "tests/train_outputs/"
}