给TTS模块增加VCTK数据继续训练模型

This commit is contained in:
liuhaozhe6788
2023-03-29 16:02:41 +08:00
parent b5296a9a36
commit b77e01b96d
12 changed files with 225 additions and 96 deletions

3
.gitignore vendored
View File

@@ -21,4 +21,5 @@ launch.json
*.pt
*.whl
*.m4a
*.png
*.png
log/

View File

@@ -70,7 +70,7 @@ python synthesizer_train.py <model_id> <datasets_root>/SV2TTS/synthesizer
```
If you want to monitor the training progress, run:
```
python update_plot.py syn
tensorboard --logdir log/synthesizer --host localhost --port 8088
```
### Vocoder
@@ -89,7 +89,7 @@ python vocoder_train.py <model_id> <datasets_root>
```
If you want to monitor the training progress, run:
```
python update_plot.py voc
tensorboard --logdir log/vocoder --host localhost --port 8080
```
**Note:**

View File

@@ -157,10 +157,9 @@ if __name__ == '__main__':
# - If the wav is already loaded:
# get duration info from input audio
# message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
# f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
# in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))
in_fpath = Path("/home/liuhaozhe/voice_cloning_project/collected_audios/openvoice_official/mellow.mp3")
message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))
fpath_without_ext = os.path.splitext(str(in_fpath))[0]
speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]

Binary file not shown.

View File

@@ -49,12 +49,12 @@ hparams = HParams(
# frame that has all values < -3.4
### Tacotron Training
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
(2, 2e-4, 80_000, 12), #
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
(2, 1e-5, 640_000, 12)], # lr = learning rate
tts_schedule = [(2, 1e-3, 40_000, 12), # Progressive training schedule
(2, 5e-4, 80_000, 12), # (r, lr, step, batch_size)
(2, 2e-4, 160_000, 12), #
(2, 1e-4, 320_000, 12), # r = reduction factor (# of mel frames
(2, 3e-5, 1280_000, 12), # synthesized for each decoder iteration)
(2, 1e-5, 2560_000, 12)], # lr = learning rate
tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
tts_eval_interval = 100, # Number of steps between model evaluation (sample generation)
@@ -80,12 +80,11 @@ hparams = HParams(
use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
# and [0, max_abs_value] if False
trim_silence = True, # Use with sample_rate of 16000 for best results
### SV2TTS
speaker_embedding_size = 256, # Dimension for the speaker embedding
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
utterance_min_duration = 1, # Duration in seconds below which utterances are discarded
)
def hparams_debug_string():

View File

@@ -1,18 +1,20 @@
from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain
from itertools import chain, groupby
from encoder import inference as encoder
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
import numpy as np
import librosa
import random
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
no_alignments: bool, datasets_name: str, subfolders: str):
# Gather the input directories
def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
datasets_name: str, subfolders: str, no_alignments=False):
# Gather the input directories of LibriSpeech
dataset_root = datasets_root.joinpath(datasets_name)
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
@@ -22,7 +24,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
dev_input_dirs = input_dirs[-1: ]
# Create the output directories for each output file type
train_out_dir = out_dir.joinpath("train-clean")
train_out_dir = out_dir.joinpath("train")
train_out_dir.mkdir(exist_ok=True)
train_out_dir.joinpath("mels").mkdir(exist_ok=True)
train_out_dir.joinpath("audio").mkdir(exist_ok=True)
@@ -31,7 +33,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
train_metadata_fpath = train_out_dir.joinpath("train.txt")
train_metadata_file = train_metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
dev_out_dir = out_dir.joinpath("dev-clean")
dev_out_dir = out_dir.joinpath("dev")
dev_out_dir.mkdir(exist_ok=True)
dev_out_dir.joinpath("mels").mkdir(exist_ok=True)
dev_out_dir.joinpath("audio").mkdir(exist_ok=True)
@@ -87,6 +89,49 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
def preprocess_vctk(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
                    datasets_name: str, subfolders: str, no_alignments=True):
    """
    Preprocess the VCTK recordings into the train/dev synthesizer directories.

    Every "*mic1.flac" file found recursively under
    <datasets_root>/<datasets_name>/<subfolders> is collected and grouped by speaker. Each
    speaker's files are shuffled with a fixed seed and split 90/10 into train and dev, so
    the dev set contains the same speakers as the train set. Both lists are then handed to
    preprocess_data.

    :param datasets_root: directory that contains the dataset directory
    :param out_dir: output root; "train" and "dev" (each with "mels" and "audio") are
    created below it
    :param n_processes: not used by this function; accepted so that the signature matches
    preprocess_librispeech
    :param skip_existing: forwarded to preprocess_data
    :param hparams: hyperparameters, forwarded to preprocess_data
    :param datasets_name: name of the dataset directory inside datasets_root
    :param subfolders: single subfolder of the dataset directory that holds the audio
    :param no_alignments: forwarded to preprocess_data
    """
    input_dir = datasets_root.joinpath(datasets_name).joinpath(subfolders)
    print("Using data from:" + str(input_dir))
    assert input_dir.exists(), "VCTK input directory not found: %s" % input_dir

    def speaker_of(fpath):
        # The parent directory name, up to the first underscore, identifies the speaker
        return fpath.parts[-2].split("_")[0]

    # Only keep the mic1 recordings. The file name is tested rather than the whole path so
    # that a directory name can never make an unrelated file match.
    mic1_fpaths = sorted(
        (p for p in input_dir.rglob("*.flac") if p.name.endswith("mic1.flac")),
        key=speaker_of,
    )

    # Train/dev split, done per speaker
    train_input_fpaths = []
    dev_input_fpaths = []
    for _, group in groupby(mic1_fpaths, key=speaker_of):
        speaker_fpaths = sorted(group)
        # A private generator seeded with 0 yields the same order as seeding the module-level
        # generator, without resetting the global random state for the rest of the program.
        random.Random(0).shuffle(speaker_fpaths)
        n_train = round(len(speaker_fpaths) * 0.9)
        train_input_fpaths.extend(speaker_fpaths[:n_train])
        dev_input_fpaths.extend(speaker_fpaths[n_train:])

    # Create the output directories for each output file type
    train_out_dir = out_dir.joinpath("train")
    dev_out_dir = out_dir.joinpath("dev")
    for split_dir in (train_out_dir, dev_out_dir):
        for file_type in ("mels", "audio"):
            # parents=True also creates split_dir (and out_dir) when they do not exist yet
            split_dir.joinpath(file_type).mkdir(parents=True, exist_ok=True)

    # Preprocess the train dataset
    preprocess_data(train_input_fpaths, mode="train", out_dir=train_out_dir, skip_existing=skip_existing,
                    hparams=hparams, no_alignments=no_alignments)

    # Preprocess the dev dataset
    preprocess_data(dev_input_fpaths, mode="dev", out_dir=dev_out_dir, skip_existing=skip_existing,
                    hparams=hparams, no_alignments=no_alignments)
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
metadata = []
for book_dir in speaker_dir.glob("*"):
@@ -146,6 +191,51 @@ def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
return [m for m in metadata if m is not None]
def preprocess_data(wav_fpaths, mode, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    """
    Preprocess a list of audio files and print statistics about the resulting metadata.

    For each audio file that has a transcript next to it, the waveform is loaded, optionally
    rescaled and passed to process_utterance. Every non-None result is written as one
    "|"-separated line to <out_dir>/<mode>.txt. The metadata file is then read back and
    summary statistics are printed.

    :param wav_fpaths: iterable of pathlib.Path audio files. The transcript is looked up in
    the same directory, under the audio file's stem with its last "_"-separated token
    removed, plus ".txt" (e.g. "p225_001_mic1.flac" -> "p225_001.txt"). Files without a
    transcript are skipped.
    :param mode: either "train" or "dev"; names the metadata file and the progress bar
    :param out_dir: directory that receives the metadata file; also passed to process_utterance
    :param skip_existing: if True the metadata file is appended to, otherwise it is overwritten
    :param hparams: hyperparameters; sample_rate, rescale and rescaling_max are read here
    :param no_alignments: utterances are only processed when this is True. When it is False no
    audio is processed and only the statistics of the metadata file are printed.
    """
    assert mode in ["train", "dev"], "mode must be \"train\" or \"dev\", got %r" % (mode,)

    # Create a metadata file. The context manager closes it even if an utterance fails.
    metadata_fpath = out_dir.joinpath(f"{mode}.txt")
    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
        if no_alignments:
            for wav_fpath in tqdm(wav_fpaths, desc=mode):
                # Get the corresponding text first: without a transcript the utterance is
                # skipped, so there is no point in decoding its audio.
                base_name = "_".join(wav_fpath.name.split(".")[0].split("_")[: -1]) + ".txt"
                text_fpath = wav_fpath.with_name(base_name)
                if not text_fpath.exists():
                    continue
                # Read as UTF-8 to match the encoding the metadata file is written with
                with text_fpath.open("r", encoding="utf-8") as text_file:
                    text = text_file.read().replace("\"", "").strip()

                # Load the audio waveform. The sample rate is passed by keyword because newer
                # librosa releases no longer accept it positionally.
                wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
                if hparams.rescale and wav.size:
                    peak = np.abs(wav).max()
                    # A completely silent file has a peak of 0; dividing would produce NaNs
                    if peak > 0:
                        wav = wav / peak * hparams.rescaling_max

                # Process the utterance
                metadata = process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                             skip_existing, hparams, trim_silence=False)
                if metadata is not None:
                    metadata_file.write("|".join(str(x) for x in metadata) + "\n")

    # Verify the contents of the metadata file. The newline is stripped so that it is not
    # counted as a text character, and blank lines are ignored.
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]
    mel_frames = sum([int(m[4]) for m in metadata])
    timesteps = sum([int(m[3]) for m in metadata])
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print(f"The {mode} dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    # default=0 keeps the report working when no utterance was written
    print("Max input length (text chars): %d" % max((len(m[5]) for m in metadata), default=0))
    print("Max mel frames length: %d" % max((int(m[4]) for m in metadata), default=0))
    print("Max audio timesteps length: %d" % max((int(m[3]) for m in metadata), default=0))
def split_on_silences(wav_fpath, words, end_times, hparams):
# Load the audio waveform
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
@@ -219,7 +309,7 @@ def split_on_silences(wav_fpath, words, end_times, hparams):
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams):
skip_existing: bool, hparams, trim_silence=True):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -240,8 +330,7 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
return None
# Trim silence
if hparams.trim_silence:
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
# Skip utterances that are too short
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
@@ -277,10 +366,10 @@ def embed_utterance(fpaths, encoder_model_fpath):
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
# create train embeddings
train_wav_dir = synthesizer_root.joinpath("train-clean/audio")
train_metadata_fpath = synthesizer_root.joinpath("train-clean/train.txt")
train_wav_dir = synthesizer_root.joinpath("train/audio")
train_metadata_fpath = synthesizer_root.joinpath("train/train.txt")
assert train_wav_dir.exists() and train_metadata_fpath.exists()
train_embed_dir = synthesizer_root.joinpath("train-clean/embeds")
train_embed_dir = synthesizer_root.joinpath("train/embeds")
train_embed_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath
@@ -295,10 +384,10 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
# create dev embeddings
dev_wav_dir = synthesizer_root.joinpath("dev-clean/audio")
dev_metadata_fpath = synthesizer_root.joinpath("dev-clean/dev.txt")
dev_wav_dir = synthesizer_root.joinpath("dev/audio")
dev_metadata_fpath = synthesizer_root.joinpath("dev/dev.txt")
assert dev_wav_dir.exists() and dev_metadata_fpath.exists()
dev_embed_dir = synthesizer_root.joinpath("dev-clean/embeds")
dev_embed_dir = synthesizer_root.joinpath("dev/embeds")
dev_embed_dir.mkdir(exist_ok=True)
# Gather the input wave filepath and the target output embed filepath

View File

@@ -16,10 +16,10 @@ from synthesizer.utils.symbols import symbols
def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
# This generates ground truth-aligned mels for vocoder training
train_in_dir = in_dir.joinpath("train-clean")
train_out_dir = out_dir.joinpath("train-clean")
dev_in_dir = in_dir.joinpath("dev-clean")
dev_out_dir = out_dir.joinpath("dev-clean")
train_in_dir = in_dir.joinpath("train")
train_out_dir = out_dir.joinpath("train")
dev_in_dir = in_dir.joinpath("dev")
dev_out_dir = out_dir.joinpath("dev")
train_synth_dir = train_out_dir / "mels_gta"
train_synth_dir.mkdir(exist_ok=True, parents=True)
dev_synth_dir = dev_out_dir / "mels_gta"

View File

@@ -16,7 +16,6 @@ from synthesizer.utils.plot import plot_spectrogram
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import sequence_to_text
from vocoder.display import *
from utils.profiler import Profiler
def np_now(x: torch.Tensor): return x.detach().cpu().numpy()
@@ -32,8 +31,15 @@ def sync(device: torch.device):
torch.cuda.synchronize(device)
def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool,
def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool, use_tb: bool,
hparams):
if use_tb:
print("Use Tensorboard")
import tensorflow as tf
import datetime
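# Write the TensorBoard summaries to a timestamped run directory
# NOTE(review): nothing here hides the GPU from TensorFlow; confirm whether
# tf.config.set_visible_devices([], "GPU") was intended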
log_dir = f"log/synthesizer/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary_writer = tf.summary.create_file_writer(log_dir)
models_dir.mkdir(exist_ok=True)
model_dir = models_dir.joinpath(run_id)
@@ -48,8 +54,8 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
meta_folder.mkdir(exist_ok=True)
weights_fpath = model_dir / f"synthesizer.pt"
train_metadata_fpath = syn_dir.joinpath("train-clean/train.txt")
dev_metadata_fpath = syn_dir.joinpath("dev-clean/dev.txt")
train_metadata_fpath = syn_dir.joinpath("train/train.txt")
dev_metadata_fpath = syn_dir.joinpath("dev/dev.txt")
print("Checkpoint path: {}".format(weights_fpath))
print("Loading training data from: {}".format(train_metadata_fpath))
@@ -91,11 +97,11 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
# Initialize the optimizer
optimizer = optim.Adam(model.parameters())
train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
if not exists("synthesizer_loss"):
import os
os.mkdir("synthesizer_loss")
# train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
# dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
# if not exists("synthesizer_loss"):
# import os
# os.mkdir("synthesizer_loss")
# Load the weights
if force_restart or not weights_fpath.exists():
@@ -111,26 +117,26 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
f.write("{}\n".format(symbol))
losses = []
dev_losses = []
# losses = []
# dev_losses = []
else:
print("\nLoading weights at %s" % weights_fpath)
model.load(weights_fpath, optimizer)
print("Tacotron weights loaded from step %d" % model.step)
losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
# losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
# dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
# Initialize the dataset
train_mel_dir = syn_dir.joinpath("train-clean/mels")
train_embed_dir = syn_dir.joinpath("train-clean/embeds")
dev_mel_dir = syn_dir.joinpath("dev-clean/mels")
dev_embed_dir = syn_dir.joinpath("dev-clean/embeds")
train_mel_dir = syn_dir.joinpath("train/mels")
train_embed_dir = syn_dir.joinpath("train/embeds")
dev_mel_dir = syn_dir.joinpath("dev/mels")
dev_embed_dir = syn_dir.joinpath("dev/embeds")
train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)
dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)
best_loss_file_path = "synthesizer_loss/best_loss.npy"
best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
# best_loss_file_path = "synthesizer_loss/best_loss.npy"
# best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
# profiler = Profiler(summarize_every=10, disabled=False)
for i, session in enumerate(hparams.tts_schedule):
@@ -230,28 +236,36 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
f"{1./time_window.average:#.2} steps/s | Step: {k}k | "
stream(msg)
if use_tb:
with train_summary_writer.as_default():
tf.summary.scalar('train_loss', loss_window.average, step=step)
tf.summary.scalar('learning_rate', lr, step=step)
# Backup or save model as appropriate
# if backup_every != 0 and step % backup_every == 0 :
# backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt"
# model.save(backup_fpath, optimizer)
torch.cuda.empty_cache()
if save_every != 0 and i % save_every == 0:
dev_loss = validate(dev_dataset, model, collate_fn)
msg = f"\n| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Train Loss: {loss_window.average:#.4} | " \
f"Dev Loss: {dev_loss:#.4} | {1./time_window.average:#.2} steps/s | Step: {k}k | "
print(msg)
losses.append(loss_window.average)
np.save(train_loss_file_path, np.array(losses, dtype=float))
dev_losses.append(dev_loss)
np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
if use_tb:
with train_summary_writer.as_default():
tf.summary.scalar('val_loss', dev_loss, step=step)
# losses.append(loss_window.average)
# np.save(train_loss_file_path, np.array(losses, dtype=float))
if dev_loss < best_loss:
# Must save latest optimizer state to ensure that resuming training
# doesn't produce artifacts
best_loss = dev_loss
np.save(best_loss_file_path, np.array([best_loss]))
model.save(weights_fpath, optimizer)
# dev_losses.append(dev_loss)
# np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
# Must save latest optimizer state to ensure that resuming training
# doesn't produce artifacts
# best_loss = dev_loss
# np.save(best_loss_file_path, np.array([best_loss]))
model.save(weights_fpath, optimizer)
# Evaluate model to generate dev samples
# epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch # If epoch is done

View File

@@ -1,4 +1,4 @@
from synthesizer.preprocess import preprocess_dataset
from synthesizer.preprocess import preprocess_librispeech, preprocess_vctk
from synthesizer.hparams import hparams
from utils.argutils import print_args
from pathlib import Path
@@ -24,12 +24,9 @@ if __name__ == "__main__":
"interrupted.")
parser.add_argument("--hparams", type=str, default="", help=\
"Hyperparameter overrides as a comma-separated list of name-value pairs")
parser.add_argument("--no_alignments", action="store_true", help=\
"Use this option when dataset does not include alignments\
(these are used to split long audio files into sub-utterances.)")
parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
# nargs="+" collects one or more space-separated names into a list. type=list would instead
# split a single command-line value into its individual characters.
parser.add_argument("--datasets_names", type=str, nargs="+", default=["LibriSpeech", "VCTK"], help=\
    "Name of the dataset directory to process.")
parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360,dev-clean", help=\
# One entry per dataset, given as space-separated values; each entry is itself a comma-separated
# list of subfolders. type=list would split a value into its individual characters.
parser.add_argument("--all_subfolders", type=str, nargs="+",
    default=["train-clean-100,train-clean-360,dev-clean", "wav48_silence_trimmed"], help=\
    "Comma-separated list of subfolders to process inside your dataset directory")
args = parser.parse_args()
@@ -44,4 +41,15 @@ if __name__ == "__main__":
# Preprocess the dataset
print_args(args, parser)
args.hparams = hparams.parse(args.hparams)
preprocess_dataset(**vars(args))
# Dispatch table: dataset name -> preprocessing entry point
preprocess_func = {
    "LibriSpeech": preprocess_librispeech,
    "VCTK": preprocess_vctk,
}
args = vars(args)
dataset_names = args["datasets_names"]
all_subfolders = args["all_subfolders"]

# Validate up front so that a bad command line fails before any dataset has been processed
if len(all_subfolders) < len(dataset_names):
    raise ValueError("Expected one subfolders entry per dataset: got %d dataset name(s) but %d entry(ies)"
                     % (len(dataset_names), len(all_subfolders)))
unknown_names = [name for name in dataset_names if name not in preprocess_func]
if unknown_names:
    raise ValueError("Unsupported dataset(s): %s. Supported: %s"
                     % (", ".join(unknown_names), ", ".join(preprocess_func)))

# Datasets and subfolder entries are matched by position
for dataset, subfolders in zip(dataset_names, all_subfolders):
    print("Preprocessing %s" % dataset)
    preprocess_func[dataset](datasets_root=args["datasets_root"], out_dir=args["out_dir"],
                             n_processes=args["n_processes"], skip_existing=args["skip_existing"],
                             hparams=args["hparams"], datasets_name=dataset, subfolders=subfolders)

View File

@@ -17,7 +17,7 @@ if __name__ == "__main__":
"the wavs and the embeds.")
parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\
"Path to the output directory that will contain the saved model weights and the logs.")
parser.add_argument("-s", "--save_every", type=int, default=100, help= \
parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
"Number of steps between updates of the model on the disk. Set to 0 to never save the "
"model.")
parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \
@@ -25,6 +25,8 @@ if __name__ == "__main__":
"model.")
parser.add_argument("-f", "--force_restart", action="store_true", help= \
"Do not load any saved model and restart from scratch.")
parser.add_argument("--use_tb", action="store_true", help= \
"Use Tensorboard support")
parser.add_argument("--hparams", default="", help=\
"Hyperparameter overrides as a comma-separated list of name=value pairs")
args = parser.parse_args()

View File

@@ -19,12 +19,19 @@ from utils.profiler import Profiler
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, save_every: int,
backup_every: int, force_restart: bool):
backup_every: int, force_restart: bool, use_tb: bool):
if use_tb:
print("Use Tensorboard")
import tensorflow as tf
import datetime
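# Write the TensorBoard summaries to a timestamped run directory
# NOTE(review): nothing here hides the GPU from TensorFlow; confirm whether
# tf.config.set_visible_devices([], "GPU") was intended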
log_dir = f"log/vocoder/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_summary_writer = tf.summary.create_file_writer(log_dir)
# Check to make sure the hop length is correctly factorised
train_syn_dir = syn_dir.joinpath("train-clean")
train_voc_dir = voc_dir.joinpath("train-clean")
dev_syn_dir = syn_dir.joinpath("dev-clean")
dev_voc_dir = voc_dir.joinpath("dev-clean")
train_syn_dir = syn_dir.joinpath("train")
train_voc_dir = voc_dir.joinpath("train")
dev_syn_dir = syn_dir.joinpath("dev")
dev_voc_dir = voc_dir.joinpath("dev")
assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
# Instantiate the model
@@ -58,23 +65,23 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
model_dir = models_dir / run_id
model_dir.mkdir(exist_ok=True)
weights_fpath = model_dir / "vocoder.pt"
train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"
# train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
# dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"
if not exists("vocoder_loss"):
import os
os.mkdir("vocoder_loss")
# if not exists("vocoder_loss"):
# import os
# os.mkdir("vocoder_loss")
if force_restart or not weights_fpath.exists():
print("\nStarting the training of WaveRNN from scratch\n")
model.save(weights_fpath, optimizer)
losses = []
dev_losses = []
# losses = []
# dev_losses = []
else:
print("\nLoading weights at %s" % weights_fpath)
model.load(weights_fpath, optimizer)
print("WaveRNN weights loaded from step %d" % model.step)
losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
# losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
# dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
# Initialize the dataset
train_metadata_fpath = train_syn_dir.joinpath("train.txt") if ground_truth else \
@@ -96,8 +103,8 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
simple_table([('Batch size', hp.voc_batch_size),
('LR', hp.voc_lr),
('Sequence Len', hp.voc_seq_len)])
best_loss_file_path = "vocoder_loss/best_loss.npy"
best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
# best_loss_file_path = "vocoder_loss/best_loss.npy"
# best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
# profiler = Profiler(summarize_every=10, disabled=False)
for epoch in range(1, 350):
@@ -139,6 +146,10 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
f"{speed:.4f}steps/s | Step: {k}k | "
stream(msg)
if use_tb:
with train_summary_writer.as_default():
tf.summary.scalar('train_loss', train_loss_window.average, step=step)
if backup_every != 0 and i % backup_every == 0 :
model.checkpoint(model_dir, optimizer)
@@ -148,14 +159,18 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
f"Train Loss: {train_loss_window.average:.4f} | Dev Loss: {dev_loss:.4f} | " \
f"{speed:.4f}steps/s | Step: {k}k | "
stream(msg)
losses.append(train_loss_window.average)
np.save(train_loss_file_path, np.array(losses, dtype=float))
dev_losses.append(dev_loss)
np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
if dev_loss < best_loss :
best_loss = dev_loss
np.save(best_loss_file_path, np.array([best_loss]))
model.save(weights_fpath, optimizer)
if use_tb:
with train_summary_writer.as_default():
tf.summary.scalar('val_loss', dev_loss, step=step)
# losses.append(train_loss_window.average)
# np.save(train_loss_file_path, np.array(losses, dtype=float))
# dev_losses.append(dev_loss)
# np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
# if dev_loss < best_loss :
# best_loss = dev_loss
# np.save(best_loss_file_path, np.array([best_loss]))
model.save(weights_fpath, optimizer)
# profiler.tick("Extra saving")

View File

@@ -38,6 +38,8 @@ if __name__ == "__main__":
"model.")
parser.add_argument("-f", "--force_restart", action="store_true", help= \
"Do not load any saved model and restart from scratch.")
parser.add_argument("--use_tb", action="store_true", help= \
"Use Tensorboard support")
args = parser.parse_args()
# Process the arguments