Files
TTS/notebooks/dataset_analysis/CheckSpectrograms.ipynb

385 lines
450 KiB
Plaintext
Raw Normal View History

2020-11-17 14:18:53 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import glob\n",
"\n",
"import IPython.display as ipd\n",
"\n",
"from tts.utils.audio import AudioProcessor\n",
"from tts.tts.utils.visual import plot_spectrogram\n",
"from tts.utils.io import load_config"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# These paths are machine-specific: point them at your own config file and dataset root.\n",
"config_path = \"/home/erogol/Projects/TTS/tts/tts/config_thorsten_de.json\"\n",
"data_path = \"/home/erogol/Data/thorsten-german/\"\n",
"\n",
"# glob returns matches in arbitrary order; sort them so that an index such as\n",
"# file_paths[10] selects the same file on every run and on every machine.\n",
"file_paths = sorted(glob.glob(data_path + \"/**/*.wav\", recursive=True))\n",
"CONFIG = load_config(config_path)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Setup Audio Processor\n",
"Play with the AP parameters until the synthesized speech below sounds good."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"Collapsed": "false"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" > Setting up Audio Processor...\n",
" | > sample_rate:22050\n",
" | > num_mels:80\n",
" | > min_level_db:-100\n",
" | > frame_shift_ms:None\n",
" | > frame_length_ms:None\n",
" | > ref_level_db:20\n",
" | > fft_size:1024\n",
" | > power:1.5\n",
" | > preemphasis:0.0\n",
" | > griffin_lim_iters:60\n",
" | > signal_norm:True\n",
" | > symmetric_norm:True\n",
" | > mel_fmin:0\n",
" | > mel_fmax:8000.0\n",
" | > spec_gain:1.0\n",
" | > stft_pad_mode:reflect\n",
" | > max_norm:4.0\n",
" | > clip_norm:True\n",
" | > do_trim_silence:True\n",
" | > trim_db:60\n",
" | > do_sound_norm:True\n",
" | > stats_path:None\n",
" | > hop_length:256\n",
" | > win_length:1024\n"
]
}
],
"source": [
"# audio={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don't need to change it \n",
"# 'fft_size': 1024, # In general, you don't need to change it \n",
"# 'sample_rate': 22050, # It depends on the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change it \n",
"# 'win_length': 1024, # In general, you don't need to change it \n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 20, # It is the base DB; raise it until the background noise disappears from the spectrogram, then lower it until the speech below sounds better.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
"# 'griffin_lim_iters': 60, # It does not give any improvement for values > 60\n",
"# 'signal_norm': True, # This is more about your model. It does not change the synthesis performance.\n",
"# 'symmetric_norm': False, # Same as above\n",
"# 'max_norm': 1, # Same as above\n",
"# 'clip_norm': True, # Same as above\n",
"# 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n",
"# 'do_trim_silence': True} # If your dataset has some silence at the beginning or end, this trims it. Check AP.load_wav() below to see if it makes any difference for the loaded audio file.\n",
"\n",
"# Build the audio processor from the `audio` section of the loaded config.\n",
"AP = AudioProcessor(\n",
"    **CONFIG.audio\n",
");"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Check audio loading "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"Collapsed": "false"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <audio controls=\"controls\" >\n",
" <source src=\"data:audio/wav;base64,UklGRlCLAgBXQVZFZm10IBAAAAABAAEAIlYAAESsAAACABAAZGF0YSyLAgCy/7P/rP+z/7P/sP+u/7f/sP+7/7v/tf/B/7n/xv+5/8j/wf/I/8r/xP/O/8j/2//V/9X/3//f/+b/6v/q//D/7P/9//v/9f8BAAAAAwALABAACwAUABIAEAAcABwAKQAjAC0AKwAtAC0APwA2ADwANAA6ADYAMgA2ADoAMgA6ADAAKQAlACkALgAlAB4AGgAWAB8ADQAYAAkAEgAAAAAA9f/x////8f/1/+z/6P/h/9//0v/X/9P/zP/Q/73/wv/E/8j/yv+9/8L/uf+//8H/wf/O/8j/0v/T/9X/5v/h//X/5v/3//D//f/5////DQAPAAkAEgANABYAHAAeACUAHgAeABgAHgAWAB4AHAAYABgAFAAYAAkADwALAAsA+f/3/+r/6v/f/+L/3//X/9D/xP/E/8L/u/+9/7f/sP+m/6H/n/+k/5P/o/+V/5P/jv+X/5D/m/+Z/6T/m/+o/5v/rv+j/8T/xP/Q/87/1//h//X/8f//////AwAJABIAGAAjACUANAA0ADYAPwBNAFIAYQBnAG0AbQB4AIEAjgCaAK4AtAC6AL8AzADbAN8A8gD4AP0ACQEFAQoBDgESAR8BFgEbAQwBFAEKARYBCgEFAf0A+wDsAPAA4QDbANYAwwC6AK4AoQCWAJgAhQB8AGEAYQBYAEsAQQAtACkAAwD7//P/3//h/87/yv+//67/qv+o/5f/m/+I/5P/g/+D/3v/hP+E/4P/g/+E/4j/f/+Q/4T/mf+I/5D/k/+V/5f/l/+m/6b/pP+k/6r/rv+q/7n/tf/C/7X/v//C/8H/yP/T/9P/2f/X/93/6v/o/+z/AwD5/wUACQAJABoAGgAlACUAJwArAC4AMgA4ADoAQQA8AEUAQQA/AD4AUABHAFQAPABSAEsATgBaAFwAXQBcAFwAYwBfAGcAbQBrAG4AZwBnAHAAawBuAHIAaQBuAF8AaQBhAF8AYQBcAF8AXQBSAFYAUABUAE4ARwBBAD4APwA/ADoARQA0ADoAPAAwADQAKwA6ACcAOgAfAC4AFAArAB8AHAASABAABQAHAPn/9//u/+z/4v/h/87/2f/M/87/vf/B/7L/sP+s/6b/qP+X/5v/jv+O/4b/iv+B/4H/e/9y/2r/aP9h/2b/Zv9Z/2b/TP9h/1L/W/9S/1L/V/9O/1T/VP9K/1X/UP9U/1n/Tv9S/1L/Vf9Q/1n/VP9V/1T/VP9b/1L/Wf9S/1v/Uv9Z/1T/X/9f/2T/Xf9q/2r/Zv9s/2z/ef9q/3f/dP9//4H/hv+S/4z/l/+V/6H/nf+w/67/uf+y/7L/xP+7/9X/xv/Z/93/4v/m/+7/9//9/wUACQASABQAHAAjACcAOAAtAEcAOgBLAFAAVgBhAF8AaQBnAG0AcgB8AH0AjACBAJoAhwChAJAAoQCdAKcAoQCuAKEAsACwALAAuACrALsArgC2ALIArgCwALAArgCrAKsAowClAKEAnACSAIsAfwB8AHoAdgByAG4AZQBaAFgATQBNAEEAQwAwADIAHwAhABoAGgAPAA8AAQADAPf/+//u/+7/5P/k/9//1f/X/9L/2f/K/9D/0v/I/8j/wv/E/8H/vf+5/7n/s/+3/67/u/+u/7f/sP+z/7P/u/+7/73/u/+z/7//t//C/7//0P/I/8r/zv/I/9L/0v/Z/9n/2//b/+z/4f/u/+j/5P/o/+z/7P/x/+r//f/s//f/7P/9//v////9//v/9//5//v/9////wEA/f8BAPn////3//f/8P/3//H/9f/s//D/6P/w/+T/5P/m/+z/6v/q/+j/3f/h/9n/3f/h/+T/6P/o/93/2//d/+H/4v/k/+b/4v/u/+T/7v/m/+T/4v/f/+H/3//f/+L/3//i/93/5P/Z/+H/3f/o/+z/5P
/i/+7/5P/x/+L/8//1//X/+//x//3/9//w//v/8/////f/AAABAAkAAwADAAAAAwAFAAUABQADABQAFAAYABwADwAjABQAHAANAB4AHAAaABoAGgAjABwAIQAaACEAHgAaABoAIQAlADAAGAAaAB4AHwAuABQAIwAlAC4AIQAeABoAKQArACsAFAAnACUAKQAeACMAGgArABYAHAAeACEAHAAWABgAHwAeABYAEAASAB4AHAAPAAkAHAAcACMABwAPABQAFgAPAAAADwALABgACwADAA0ABQAFAP//AAADAAkA+f/5////AQABAPf/9f/7/wsA+f////f/AwAAAPf/7v/9/wAA+f/u//H/9f/7/+j/4f/o//n//f/s/9v/5P/s//P/5v/i//P/8P/s/9//7P/z/wAA8P/z//D/AADm//H/5v8PAPv/9//d//X/AwADAPX/8//7/wkAAQDz//f/AwAFAPn/9f8BAAsACwABAAAACQADAP//9/8NABIACwD3/wAAAwANAAcA//8FABQADwADAAMADQAjABQABwAHABIAFAAPAAkAHAAQABYABwASABYAHAALAAkAFAAYABAABwAHAAMACQAAAAEABwAJAA0AAAAAAA8ACQAFAPv/CQASABwABwAJABIAHgAQAAEABQALABYAFgAHABAADQAWABAAFAAaABIAAwAHABAAHgAPAAEABwAUABwADwD//wsAEAAeAAMA//8JABAADwANAAUAGgAQABQADQAPABoAEAANAAMAHgASABwAAQAUAAkAFAAJAP//CQANAAMAAwD9/wkACwAJAP3/BQAHAAcA+/8BAP//CQADAP////8DAAsA/////wUAAwANAPX/+//9/wAAAAD1//v/AADz//v/7v/5/wMA9f8AAPX//f/7//P////w/w8A+f///+7////9/wMA/f////P/AAD5/wMA9//5//n/8P////H/+//3//H/AwD1//3/8//z////8/////P/9//7//n//f/1//X/AAADAPv/+f/5//f/+//9////9f/5/////f/1//v//f/7//H/+//u//v/7v////H/+f/3//X/9f/9//3///////n/+//3//3/AAABAPv/+f8AAPv//f8DAPv/AADz////9//3/wAA9f8DAPf/+//9//3/+//9////AAD3/wEA+f8FAAAAAQD5//v/AQD//wAA+f8DAAMA+/////X/BwD7/wAAAQD1//3/+f8HAAEABQD5/wUA+f8FAAMAAQABAAUACQAJAP3/AAD9/wEAAQADAAMA+/8BAAEAAAD5/wAA8/8FAP3/BwDx/wAA9/8HAP/////3//f/+f8BAAAA///9//3/+/8AAPv/AAD5//v////7/wcA9f/7/wEA/f/7//P/9////wAA9//3//H/8//5//v//f////f////3//H//f/1//v/8P/9////9f/7/wEAAwD9//v/+//5/wAAAQD1/wMA+/8HAP3///8BAAAAAAABAAEA//8HAAAADQD7/wUA///5////AAD//wAA+/8BAP//AwAAAAAABQD5/wcA+/8BAAMA/f8HAP3/BwD//wMAAAAAAAEABwAAAAEA/f8HAAEAAQD//wcACwAJAAMAAwAJAAUACQADAAkABQANAAAAAwAHAAEACQD5//v/BQALAAcA+/8HAAcA+f8FAP//DwD//////f8LAPn/EAD5/wMAAwALAAAA+f8AAAEACwD9/wkA/f8NAAEABQD7/wEA/f/7/wUA+f8DAP///f/7/wMAAQADAPP/AQD///n/AQAAAPv//f/5/wcA+/8AAPn////9/wAAAAD///3/CQD9/wEA9/8FAP//AwD//w0A//8BAP3/AwAFAP3/AAD3/wcA//8FAAAA/f8FAAEAAwD5/wMABwANAAMABQD9/wUABQAJAAkA+/8QAPn/CQD3/wMAAwAAAAAAAAADAPf/AAD9/wMAAQADAP///f8FAP3/Cw
D3/w8A//8DAP3/AQAAAAAAAQD9/wEA/f8FAAEAAQD7/wAAAQD//wEA//8AAAAAAAD5/wE
" Your browser does not support the audio element.\n",
" </audio>\n",
" "
],
"text/plain": [
"<IPython.lib.display.Audio object>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load one sample through the audio processor and play it back.\n",
"sample_path = file_paths[10]\n",
"wav = AP.load_wav(sample_path)\n",
"ipd.Audio(data=wav, rate=AP.sample_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Replace the `power` value taken from the config before re-synthesis.\n",
"power_override = 1.0\n",
"AP.power = power_override"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"Collapsed": "false"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Max: 2.4340844\n",
"Min: 2.0181823\n",
"Mean: 2.2137265\n"
]
},
{
"data": {
"text/html": [
"\n",
" <audio controls=\"controls\" >\n",
" <source src=\"data:audio/wav;base64,UklGRiSKAgBXQVZFZm10IBAAAAABAAEAIlYAAESsAAACABAAZGF0YQCKAgDR/83/yf/H/8f/yP/H/8b/xv+//7f/tv+3/7n/vP+4/7b/uf+1/7P/uv+8/7v/uv+1/7P/tv+6/77/v/+9/7z/u/+3/7j/vv/E/8X/wv/E/8j/z//X/93/4//p/+n/6//0////CAANAA8AGwAkACYAMAA1ADUAPwBFAEIARgBOAFQAWQBcAF8AYABfAGMAawBuAG8AbQBpAGkAagBtAHAAcABvAHIAcgBuAGoAbABvAGsAagBqAGQAZABrAGYAXwBeAFkAVABTAFEAUABOAEkARQBCAD0APQA3AC0AKQAoACMAHAAWABIADgAIAAIA/P/z/+3/6f/n/+P/2P/O/83/x/+6/7T/sf+n/5//mv+S/43/i/+I/4P/ff94/3j/ev95/3j/fP+E/4r/i/+P/5f/nf+n/7H/uf/H/9T/2//k/+n/8f8FABUAHgAnADEAPgBLAFEAXgBrAG0AdgCDAIQAjQCeAKUArACzALQAvQDGAMcAygDNAM4AzwDNAM8A1QDRAM0AywDCALoAuACxAK4AqwCgAJcAkACCAHYAdgBzAGgAWQBRAE0ARAA/AEEAOgAxAC4AIAATABEACQAAAP3/7P/c/9v/1f/I/8b/xP+6/6//qv+m/5//mf+Y/5D/gv+B/4T/gP+A/4b/jP+P/43/iv+Q/5T/mf+j/6X/pf+w/7v/wv/L/9b/5f/t/+v/7//6//v/+v8BAAcACQAOABMAFAAUABoAIAAfABsAHgAiACEAHwAkACQAHgAfACMAHgAdAB8AGgAXABUACwAHAAsACgAFAAEAAAD+//T/8P/3//j/8f/v/+3/7P/r/+n/7v/x/+//8//5//f/+f8AAAIABgAHAAoADwAPABEAFgAXABoAHgAYABcAHQAbABgAGgAaABkAGwAcABsAGQAYABkAFgAVABcAGAAZABwAGgAYABsAGwAaABkAFgATABAADQAMAAcA/v/4//f/8v/m/9r/2v/Z/8z/wf+8/7T/rf+s/6r/oP+R/4v/jv+K/4T/gf95/3P/cv9t/2b/Yf9d/1v/Uv9G/0H/P/89/z3/Nf8o/yP/IP8c/xb/Ef8O/wn/A/8D/wP/Af8E/wj/Cf8L/wv/Dv8V/xz/I/8n/yb/K/8z/zj/QP9H/03/U/9Z/13/YP9i/27/ef98/4H/iP+L/5P/nv+o/7D/tv+7/8L/yP/P/9f/3f/n/+3/7P/0/wEACgAVABwAHwAmAC4ANwBEAE0AVABiAGgAbAB6AIQAigCTAJgAmwCjAKsAsQC5AL8AxQDLAM8A2ADdANsA4QDrAOsA6wDtAO4A8QD0APcA9wDxAO4A7wDqAOgA6QDmAOAA2wDYANYA0gDMAMoAxQC5ALEArACmAKIAnQCVAJAAiQB+AHgAdQBwAGYAYABaAFAARwBEAD4ANQAuACgAIwAfABgAEgANAAQA///9//T/6//p/+f/4v/a/9D/yv/H/8P/wP+9/7f/r/+q/6r/qf+j/6D/n/+Z/5X/l/+V/5P/k/+N/4r/jf+M/4z/kP+N/4v/j/+N/4v/jP+L/4//lP+T/5f/nv+g/6H/of+g/6P/pf+o/63/rf+r/6z/sv+4/7j/u//D/8f/xv/H/8f/yP/M/8//0v/V/9f/4v/o/+b/6v/t/+v/8v/5//z/AQADAAAABAAFAAMACAAMAA4AEQANAAwAEgASABIAGQAcABsAGwAXABIAEgASABAAEQATABUAFwAYABYAEQAMAAwAEAARABEAEgAPAAoACwAGAAAAAwAJAAcABQACAP///f/7//v//v8AAAIAAQD9//n/+v/5//r//f///wAAAgAEAAMAAQAAAP3//f8AAAAA/v8AAAQABQAEAAQABAAIAAsACQAIAAwADwAOABIAGAAbABoAGQAdACIAIQ
AkACsAKgAoACgAJgApADEAMAAwADMANAAyAC0AKAAsAC0ALAAvAC4ALgAxACwAJwAiABoAGwAbABcAHQAfABoAHAAYABEAEwAOAAYABwD+//T//f8AAPz//f/4//X/9P/t/+3/8//x/+//7//s/+z/7P/n/+b/6P/q/+3/6v/q/+3/6v/i/97/3v/n/+v/4f/e/+X/5//l/+T/5P/o/+f/4P/g/+j/7//2//b/8f/v//D/8f/3//3///8DAAcAAQD8//3/AAAGAAYAAAAAAAUABQADAAUACQANAAoAAwAGABIAGgAWAAsACQARABIACgAJABIAGQARAAAA/v8QABsADAD6//z/EAATAAEA+/8KABUACADz//L/BQAOAAIA+P///woABwD8//z/CgAUAAoA/f8EABoAFQAAAP3/DwAcABEA+//6/xEAHAAKAPf///8UABUAAQD6/wYAFAARAAMAAwAQABAACgAMABUAHAATAAUADwAeABYACgAKABQAHwAWAAIABAAYACIAGgALAAsAGQAcABUAFQAYABUADgAIAA0AEwANAAUABQAKAA0ABwD+/wAACQAHAP7/+f///wcAAgD7/wIACwAGAAAAAgAIAAsABgD+//3/BQAIAAQAAgACAAAAAAD9//r/+//1/+z/8v/7//b/7//t/+7/9//9//T/7P/y//v/+P/z//L/8v/3//r/9v/3//3/+P/2//z/+v/x/+3/8P/3//n/8v/u//D/8v/z//L/8P/s/+v/8f/y//H/9P/v/+P/5f/x//L/7//s/+r/6f/q/+7/8P/x//X/9v/x/+//7f/s//L/9v/y/+//7f/x//n/+v/3//b/9v/3//j/9P/0//n//P/5//b/+v8AAAAA/f/9//z/+/8AAAQAAQD+////AAD//wAAAgD6//b///8AAPz//P/9//3/AAAFAAgABgADAAQAAQD+/wEABAAAAAIACgAIAAQABwAGAAUABgAEAAIAAQD+////BAADAP3//P///wAAAAD+//r/+//+//7//f/+/wEAAQD+//v//P/9//3/AAACAPv/9v/5//v/+v/8//j/9P/0//X/9v/0//D/9//9//b/8//2//b/+P/7//j/9f/y//P/+P/7//7//v/4//P/9//5//j/9f/1//r/+f/y//T/+f/2//X/9v/1//b/9P/y//f/+f/5//n/9f/2//7////4//j/+//8////AAD9//3/AAAAAP7//v//////AAAAAPv/+v/9/wAAAgD+//n//f8AAP3//f////3///8CAAMAAgD///z/AAAGAAQAAgADAAIAAgAGAAMAAAAEAAkACAAFAAEAAwAKAAcAAgAHAAgAAgAGAAsACQAJAAYAAgAFAAgACQAMAAYABAANAAwABQAIAAwACQADAAAAAwAGAAUABQAFAAIA/f/6//z/AQADAAIAAQABAAQABwAAAPz/AAAFAAcACQAJAAcABgAHAAgABQAAAAAAAgABAAMABQAAAAEADAANAAgABAAAAP7/AAAEAAUA/v///wkABQD9/wAAAQADAAkABAABAAgACQAFAAQAAwAGAAcABQAIAAoACQAKAAMAAAAJAAsABgAGAAcABwAJAAUAAAADAAsADAD///v/CQAPAAYAAgADAAIAAQD9//z/AwAJAAkAAwD+/wAAAwABAAEAAQAAAAEAAAD//wAAAAAAAP7//P8BAAMA+v/5/wMABwABAP3//f8AAAIAAwADAAAAAAAFAAUAAAAAAAEA/v/+/wAA/P/4//3/AAD+//n/+f/7//z//f/8//n/+P/7//3//f/+//7/+//6//r///8CAP7/+f///wEAAAAAAP////8DAAEA/P///wIABAAEAP//+f/+/wcABAD9//3/AAD9//n//v8DAAEA/P/8/wIABQADAAAAAAAEAAgABQD+/wAACgAIAP3/+/8CAAMA/P/9/wEAAQAAAAAA+v/+/wUAAQD//wIABA
AGAAcA///+/wMAAgAAAAAA/f/7//r//P8CAAQAAAD+//v//v8DAP///P/+////BAAKAAM
" Your browser does not support the audio element.\n",
" </audio>\n",
" "
],
"text/plain": [
"<IPython.lib.display.Audio object>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Mel-spectrogram of the loaded sample, with its summary statistics.\n",
"mel = AP.melspectrogram(wav)\n",
"for label, stat in ((\"Max:\", mel.max), (\"Min:\", mel.min), (\"Mean:\", mel.mean)):\n",
"    print(label, stat())\n",
"plot_spectrogram(mel.T, AP);\n",
"\n",
"# Re-synthesize audio from the mel-spectrogram and play the result.\n",
"wav_gen = AP.inv_melspectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"Collapsed": "false"
},
"outputs": [
{
"ename": "RuntimeError",
"evalue": " [!] Mean-Var stats does not match the given feature dimensions.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-18-91e8914b5c6a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36m_normalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions."
]
}
],
"source": [
"# Linear spectrogram of the loaded sample, with its summary statistics.\n",
"spec = AP.spectrogram(wav)\n",
"for label, stat in ((\"Max:\", spec.max), (\"Min:\", spec.min), (\"Mean:\", spec.mean)):\n",
"    print(label, stat())\n",
"plot_spectrogram(spec.T, AP);\n",
"\n",
"# Re-synthesize audio from the linear spectrogram and play the result.\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values of one parameter at a time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# Hand-written audio settings used for the comparisons below.\n",
"# NOTE(review): the commented example in the earlier cell uses fft_size / hop_length /\n",
"# win_length, and the processor settings printed there do not list num_freq -- confirm\n",
"# that num_freq, frame_length_ms and frame_shift_ms are the keys AudioProcessor expects.\n",
"audio = dict(\n",
"    audio_processor='audio',\n",
"    num_mels=80,  # In general, you don't need to change it.\n",
"    num_freq=1025,  # In general, you don't need to change it.\n",
"    sample_rate=22050,  # Depends on the sample rate of the dataset.\n",
"    frame_length_ms=50,  # In general, you don't need to change it.\n",
"    frame_shift_ms=12.5,  # In general, you don't need to change it.\n",
"    preemphasis=0.98,  # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"    min_level_db=-100,\n",
"    ref_level_db=20,  # Base DB: raise it until the background noise disappears from the spectrogram, then lower it until the speech below sounds better.\n",
"    power=1.5,  # Change this value and listen to the synthesized voice. 1.2 - 1.5 are some reasonable values.\n",
"    griffin_lim_iters=60,  # No improvement for values > 60.\n",
"    signal_norm=True,  # This is more about your model; it does not change the synthesis performance.\n",
"    symmetric_norm=False,  # Same as above.\n",
"    max_norm=1,  # Same as above.\n",
"    clip_norm=True,  # Same as above.\n",
"    mel_fmin=0.0,  # You can play with this and check the mel-spectrogram based voice synthesis below.\n",
"    mel_fmax=8000.0,  # You can play with this and check the mel-spectrogram based voice synthesis below.\n",
"    do_trim_silence=True,  # If your dataset has some silence at the beginning or end, this trims it. Check AP.load_wav() to see if it makes any difference for the loaded audio file.\n",
")\n",
"\n",
"AP = AudioProcessor(**audio);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
"import IPython\n",
"plt.rcParams['figure.figsize'] = (20.0, 16.0)\n",
"\n",
"def compare_values(attribute, values, file):\n",
"    \"\"\"Re-synthesize one file under several values of a single AP attribute.\n",
"\n",
"    For every value, the attribute is set on the global ``AP``, the file is\n",
"    loaded, and its spectrogram is plotted next to the loaded and the\n",
"    re-synthesized waveforms. Afterwards the ground-truth audio and every\n",
"    re-synthesized clip are displayed as audio players.\n",
"\n",
"    The attribute is put back to its initial value once the sweep is over, so\n",
"    consecutive sweeps over different attributes do not influence each other.\n",
"\n",
"    Args:\n",
"        attribute (str): name of the ``AP`` attribute to vary.\n",
"        values (list): values to compare.\n",
"        file (str): path of the audio file used for the test.\n",
"    \"\"\"\n",
"    had_attr = hasattr(AP, attribute)\n",
"    original_value = getattr(AP, attribute, None)\n",
"    wavs = []\n",
"    try:\n",
"        for idx, val in enumerate(values):\n",
"            # setattr instead of exec on a formatted string: no code is built from\n",
"            # the arguments, and non-numeric values (e.g. strings) are assigned as-is.\n",
"            setattr(AP, attribute, val)\n",
"            wav = AP.load_wav(file)\n",
"            spec = AP.spectrogram(wav)\n",
"            spec_denorm = AP._denormalize(spec.T)\n",
"\n",
"            # Left column: spectrogram obtained with this value.\n",
"            plt.subplot(len(values), 2, 2*idx + 1)\n",
"            plt.imshow(spec_denorm.T, aspect=\"auto\", origin=\"lower\")\n",
"            # plt.colorbar()\n",
"            plt.tight_layout()\n",
"\n",
"            wav_gen = AP.inv_spectrogram(spec)\n",
"            wavs.append(wav_gen)\n",
"\n",
"            # Right column: loaded waveform overlaid with the re-synthesized one.\n",
"            plt.subplot(len(values), 2, 2*idx + 2)\n",
"            display.waveplot(wav, alpha=0.5)\n",
"            display.waveplot(wav_gen, alpha=0.25)\n",
"            plt.title(\"{}={}\".format(attribute, val))\n",
"            plt.tight_layout()\n",
"    finally:\n",
"        # Undo the override even if one value fails, so AP keeps its configured value.\n",
"        if had_attr:\n",
"            setattr(AP, attribute, original_value)\n",
"\n",
"    wav = AP.load_wav(file)\n",
"    print(\" > Ground-truth\")\n",
"    IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))\n",
"\n",
"    for val, wav_gen in zip(values, wavs):\n",
"        print(\" > {} = {}\".format(attribute, val))\n",
"        IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# Sweep the preemphasis setting on one sample file.\n",
"preemphasis_values = [0, 0.5, 0.97, 0.98, 0.99]\n",
"compare_values(\"preemphasis\", preemphasis_values, file_paths[10])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"# Sweep the ref_level_db setting on the same sample file.\n",
"ref_level_db_values = [10, 15, 20, 25, 30, 35, 40]\n",
"compare_values(\"ref_level_db\", ref_level_db_values, file_paths[10])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}