"Play with the AP parameters until you find a good fit with the synthesized speech below. "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"Collapsed": "false"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Setting up Audio Processor...\n",
" | > sample_rate:22050\n",
" | > num_mels:80\n",
" | > min_level_db:-100\n",
" | > frame_shift_ms:None\n",
" | > frame_length_ms:None\n",
" | > ref_level_db:20\n",
" | > fft_size:1024\n",
" | > power:1.5\n",
" | > preemphasis:0.0\n",
" | > griffin_lim_iters:60\n",
" | > signal_norm:True\n",
" | > symmetric_norm:True\n",
" | > mel_fmin:0\n",
" | > mel_fmax:8000.0\n",
" | > spec_gain:1.0\n",
" | > stft_pad_mode:reflect\n",
" | > max_norm:4.0\n",
" | > clip_norm:True\n",
" | > do_trim_silence:True\n",
" | > trim_db:60\n",
" | > do_sound_norm:True\n",
" | > stats_path:None\n",
" | > hop_length:256\n",
" | > win_length:1024\n"
]
}
],
"source": [
"# audio={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don't need to change it \n",
"# 'fft_size': 1024, # In general, you don't need to change it \n",
"# 'sample_rate': 22050, # It depends on the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change it \n",
"# 'win_length': 1024, # In general, you don't need to change it \n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
"# 'griffin_lim_iters': 60, # It does not give any improvement for values > 60\n",
"# 'signal_norm': True, # This is more about your model. It does not give any change for the synthesis performance.\n",
"# 'symmetric_norm': False, # Same as above\n",
"# 'max_norm': 1, # Same as above\n",
"# 'clip_norm': True, # Same as above\n",
"# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'do_trim_silence': True} # If your dataset has some silence at the beginning or end, this trims it. Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mnormalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions."
]
}
],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
"print(\"Min:\", spec.min())\n",
"print(\"Mean:\", spec.mean())\n",
"plot_spectrogram(spec.T, AP);\n",
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values per parameter at a time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"audio={\n",
" 'audio_processor': 'audio',\n",
" 'num_mels': 80, # In general, you don't need to change it \n",
" 'num_freq': 1025, # In general, you don't need to change it \n",
" 'sample_rate': 22050, # It depends on the sample rate of the dataset.\n",
" 'frame_length_ms': 50, # In general, you don't need to change it \n",
" 'frame_shift_ms': 12.5, # In general, you don't need to change it \n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n",
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
" 'griffin_lim_iters': 60, # It does not give any improvement for values > 60\n",
" 'signal_norm': True, # This is more about your model. It does not give any change for the synthesis performance.\n",
" 'symmetric_norm': False, # Same as above\n",
" 'max_norm': 1, # Same as above\n",
" 'clip_norm': True, # Same as above\n",
" 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
" 'do_trim_silence': True} # If your dataset has some silence at the beginning or end, this trims it. Check the AP.load_wav() below, if it causes any difference for the loaded audio file.\n",