"# 'num_mels': 80, # In general, you don't need to change it \n",
"# 'fft_size': 1024, # In general, you don't need to change it \n",
"# 'sample_rate': 22050, # It depends on the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change it \n",
"# 'win_length': 1024, # In general, you don't need to change it \n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 20, # It is the base dB level; raise it until the background noise in the spectrogram is removed, then lower it until the speech below sounds better.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
"# 'griffin_lim_iters': 60, # It does not give any improvement for values > 60\n",
"# 'signal_norm': True, # This is more about your model. It does not give any change for the synthesis performance.\n",
"# 'symmetric_norm': False, # Same as above\n",
"# 'max_norm': 1, # Same as above\n",
"# 'clip_norm': True, # Same as above\n",
"# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'do_trim_silence': True} # If your dataset has some silence at the beginning or end, this trims it. Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n",
"Optimize your parameters by comparing different values for one parameter at a time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"audio={\n",
" 'audio_processor': 'audio',\n",
" 'num_mels': 80, # In general, you don't need to change it \n",
" 'num_freq': 1025, # In general, you don't need to change it \n",
" 'sample_rate': 22050, # It depends on the sample rate of the dataset.\n",
" 'frame_length_ms': 50, # In general, you don't need to change it \n",
" 'frame_shift_ms': 12.5, # In general, you don't need to change it \n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 20, # It is the base dB level; raise it until the background noise in the spectrogram is removed, then lower it until the speech below sounds better.\n",
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
" 'griffin_lim_iters': 60, # It does not give any improvement for values > 60\n",
" 'signal_norm': True, # This is more about your model. It does not give any change for the synthesis performance.\n",
" 'symmetric_norm': False, # Same as above\n",
" 'max_norm': 1, # Same as above\n",
" 'clip_norm': True, # Same as above\n",
" 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'do_trim_silence': True} # If your dataset has some silence at the beginning or end, this trims it. Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n",