mirror of
https://github.com/liuhaozhe6788/voice-cloning-collab.git
synced 2026-05-18 05:04:51 +02:00
给TTS模块增加VCTK数据继续训练模型
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -21,4 +21,5 @@ launch.json
|
||||
*.pt
|
||||
*.whl
|
||||
*.m4a
|
||||
*.png
|
||||
*.png
|
||||
log/
|
||||
@@ -70,7 +70,7 @@ python synthesizer_train.py <model_id> <datasets_root>/SV2TTS/synthesizer
|
||||
```
|
||||
If you want to monitor the training progress, run:
|
||||
```
|
||||
python update_plot.py syn
|
||||
tensorboard --logdir log/synthesizer --host localhost --port 8088
|
||||
```
|
||||
### Vocoder
|
||||
|
||||
@@ -89,7 +89,7 @@ python vocoder_train.py <model_id> <datasets_root>
|
||||
```
|
||||
If you want to monitor the training progress, run:
|
||||
```
|
||||
python update_plot.py voc
|
||||
tensorboard --logdir log/vocoder --host localhost --port 8080
|
||||
```
|
||||
**Note:**
|
||||
|
||||
|
||||
@@ -157,10 +157,9 @@ if __name__ == '__main__':
|
||||
# - If the wav is already loaded:
|
||||
|
||||
# get duration info from input audio
|
||||
# message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
|
||||
# f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
|
||||
# in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))
|
||||
in_fpath = Path("/home/liuhaozhe/voice_cloning_project/collected_audios/openvoice_official/mellow.mp3")
|
||||
message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
|
||||
f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
|
||||
in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))
|
||||
|
||||
fpath_without_ext = os.path.splitext(str(in_fpath))[0]
|
||||
speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]
|
||||
|
||||
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
@@ -49,12 +49,12 @@ hparams = HParams(
|
||||
# frame that has all values < -3.4
|
||||
|
||||
### Tacotron Training
|
||||
tts_schedule = [(2, 1e-3, 20_000, 12), # Progressive training schedule
|
||||
(2, 5e-4, 40_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 80_000, 12), #
|
||||
(2, 1e-4, 160_000, 12), # r = reduction factor (# of mel frames
|
||||
(2, 3e-5, 320_000, 12), # synthesized for each decoder iteration)
|
||||
(2, 1e-5, 640_000, 12)], # lr = learning rate
|
||||
tts_schedule = [(2, 1e-3, 40_000, 12), # Progressive training schedule
|
||||
(2, 5e-4, 80_000, 12), # (r, lr, step, batch_size)
|
||||
(2, 2e-4, 160_000, 12), #
|
||||
(2, 1e-4, 320_000, 12), # r = reduction factor (# of mel frames
|
||||
(2, 3e-5, 1280_000, 12), # synthesized for each decoder iteration)
|
||||
(2, 1e-5, 2560_000, 12)], # lr = learning rate
|
||||
|
||||
tts_clip_grad_norm = 1.0, # clips the gradient norm to prevent explosion - set to None if not needed
|
||||
tts_eval_interval = 100, # Number of steps between model evaluation (sample generation)
|
||||
@@ -80,12 +80,11 @@ hparams = HParams(
|
||||
use_lws = False, # "Fast spectrogram phase recovery using local weighted sums"
|
||||
symmetric_mels = True, # Sets mel range to [-max_abs_value, max_abs_value] if True,
|
||||
# and [0, max_abs_value] if False
|
||||
trim_silence = True, # Use with sample_rate of 16000 for best results
|
||||
|
||||
### SV2TTS
|
||||
speaker_embedding_size = 256, # Dimension for the speaker embedding
|
||||
silence_min_duration_split = 0.4, # Duration in seconds of a silence for an utterance to be split
|
||||
utterance_min_duration = 1.6, # Duration in seconds below which utterances are discarded
|
||||
utterance_min_duration = 1, # Duration in seconds below which utterances are discarded
|
||||
)
|
||||
|
||||
def hparams_debug_string():
|
||||
|
||||
@@ -1,18 +1,20 @@
|
||||
from multiprocessing.pool import Pool
|
||||
from synthesizer import audio
|
||||
from functools import partial
|
||||
from itertools import chain
|
||||
from itertools import chain, groupby
|
||||
from encoder import inference as encoder
|
||||
from pathlib import Path
|
||||
from utils import logmmse
|
||||
from tqdm import tqdm
|
||||
import numpy as np
|
||||
import librosa
|
||||
import random
|
||||
|
||||
|
||||
def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
|
||||
no_alignments: bool, datasets_name: str, subfolders: str):
|
||||
# Gather the input directories
|
||||
def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
|
||||
datasets_name: str, subfolders: str, no_alignments=False):
|
||||
|
||||
# Gather the input directories of LibriSpeech
|
||||
dataset_root = datasets_root.joinpath(datasets_name)
|
||||
input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
|
||||
print("\n ".join(map(str, ["Using data from:"] + input_dirs)))
|
||||
@@ -22,7 +24,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
|
||||
dev_input_dirs = input_dirs[-1: ]
|
||||
|
||||
# Create the output directories for each output file type
|
||||
train_out_dir = out_dir.joinpath("train-clean")
|
||||
train_out_dir = out_dir.joinpath("train")
|
||||
train_out_dir.mkdir(exist_ok=True)
|
||||
train_out_dir.joinpath("mels").mkdir(exist_ok=True)
|
||||
train_out_dir.joinpath("audio").mkdir(exist_ok=True)
|
||||
@@ -31,7 +33,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
|
||||
train_metadata_fpath = train_out_dir.joinpath("train.txt")
|
||||
train_metadata_file = train_metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
|
||||
|
||||
dev_out_dir = out_dir.joinpath("dev-clean")
|
||||
dev_out_dir = out_dir.joinpath("dev")
|
||||
dev_out_dir.mkdir(exist_ok=True)
|
||||
dev_out_dir.joinpath("mels").mkdir(exist_ok=True)
|
||||
dev_out_dir.joinpath("audio").mkdir(exist_ok=True)
|
||||
@@ -87,6 +89,49 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
|
||||
print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
||||
|
||||
|
||||
def preprocess_vctk(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
                    datasets_name: str, subfolders: str, no_alignments=True):
    """Preprocess the VCTK recordings into ``train`` and ``dev`` synthesizer data.

    All ``*.flac`` files below ``datasets_root / datasets_name / subfolders`` are
    grouped by speaker, where the speaker id is the part of the parent directory
    name before the first underscore. Only the ``mic1`` recordings are kept. Each
    speaker's recordings are shuffled with a fixed seed and split 90% / 10%, so the
    dev set is drawn from the same speakers as the train set.

    Args:
        datasets_root: directory that contains the dataset directory.
        out_dir: output root; ``train`` and ``dev`` sub-directories are created in it.
        n_processes: unused here; kept so the signature matches
            ``preprocess_librispeech`` and both can be called the same way.
        skip_existing: forwarded to ``preprocess_data``.
        hparams: hyperparameters, forwarded to ``preprocess_data``.
        datasets_name: name of the dataset directory inside ``datasets_root``.
        subfolders: sub-directory of the dataset that holds the audio.
        no_alignments: forwarded to ``preprocess_data``.

    Raises:
        AssertionError: if the input directory does not exist.
    """
    # Gather the input directory of VCTK
    dataset_root = datasets_root.joinpath(datasets_name)
    input_dir = dataset_root.joinpath(subfolders)
    print("Using data from:" + str(input_dir))
    assert input_dir.exists(), f"VCTK input directory not found: {input_dir}"

    # groupby only merges adjacent items, so the (speaker, path) pairs must be sorted first
    pairs = sorted((p.parent.name.split("_")[0], p) for p in input_dir.rglob("*.flac"))

    # Train / dev split, done per speaker
    train_input_fpaths = []
    dev_input_fpaths = []
    for _, group in groupby(pairs, key=lambda pair: pair[0]):
        # Only keep the mic1 recordings; match on the file name, not on the whole path
        speaker_paths = sorted(p for _, p in group if p.name.endswith("mic1.flac"))
        # A private generator seeded with 0 yields the same order as
        # random.seed(0) + random.shuffle, without resetting the global RNG state
        random.Random(0).shuffle(speaker_paths)
        n_train = round(len(speaker_paths) * 0.9)
        train_input_fpaths.extend(speaker_paths[:n_train])
        dev_input_fpaths.extend(speaker_paths[n_train:])

    # Create the output directories for each output file type
    train_out_dir = out_dir.joinpath("train")
    dev_out_dir = out_dir.joinpath("dev")
    for split_dir in (train_out_dir, dev_out_dir):
        # parents=True so that a not-yet-existing out_dir does not abort the run
        split_dir.mkdir(parents=True, exist_ok=True)
        split_dir.joinpath("mels").mkdir(exist_ok=True)
        split_dir.joinpath("audio").mkdir(exist_ok=True)

    # Preprocess the train dataset
    preprocess_data(train_input_fpaths, mode="train", out_dir=train_out_dir, skip_existing=skip_existing,
                    hparams=hparams, no_alignments=no_alignments)

    # Preprocess the dev dataset
    preprocess_data(dev_input_fpaths, mode="dev", out_dir=dev_out_dir, skip_existing=skip_existing,
                    hparams=hparams, no_alignments=no_alignments)
|
||||
|
||||
|
||||
def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
|
||||
metadata = []
|
||||
for book_dir in speaker_dir.glob("*"):
|
||||
@@ -146,6 +191,51 @@ def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
|
||||
return [m for m in metadata if m is not None]
|
||||
|
||||
|
||||
def preprocess_data(wav_fpaths, mode, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    """Process a list of audio files and write the ``{mode}.txt`` metadata file.

    For every audio file the transcript is read from a ``.txt`` file in the same
    directory whose name is the audio file name without its last ``_``-separated
    token. Files without a transcript are skipped. Each utterance is handed to
    ``process_utterance`` with silence trimming disabled, and one ``|``-separated
    metadata line is written per utterance that it accepts. Finally the metadata
    file is read back and summary statistics are printed.

    Args:
        wav_fpaths: iterable of ``Path`` objects pointing at the audio files.
        mode: ``"train"`` or ``"dev"``; selects the metadata file name.
        out_dir: directory that receives the metadata file; also passed to
            ``process_utterance``.
        skip_existing: if True the metadata file is appended to instead of being
            overwritten; also forwarded to ``process_utterance``.
        hparams: hyperparameters; ``sample_rate``, ``rescale`` and
            ``rescaling_max`` are read here.
        no_alignments: audio is only processed when this is True. When False,
            no utterance is processed and only the summary is printed.

    Raises:
        AssertionError: if ``mode`` is not ``"train"`` or ``"dev"``.
    """
    assert mode in ["train", "dev"]

    # Create a metadata file; the context manager closes it even if an utterance fails
    metadata_fpath = out_dir.joinpath(f"{mode}.txt")
    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
        if no_alignments:
            for wav_fpath in tqdm(wav_fpaths, desc=mode):
                # Locate the transcript first so audio without text is never decoded
                base_name = "_".join(wav_fpath.name.split(".")[0].split("_")[: -1]) + ".txt"
                text_fpath = wav_fpath.with_name(base_name)
                if not text_fpath.exists():
                    continue
                with text_fpath.open("r", encoding="utf-8") as text_file:
                    text = text_file.read()
                text = text.replace("\"", "").strip()

                # Load the audio waveform (sr must be passed by keyword in librosa >= 0.10)
                wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
                if hparams.rescale:
                    peak = np.abs(wav).max() if wav.size else 0
                    # An all-zero clip would otherwise be turned into NaNs by the division
                    if peak > 0:
                        wav = wav / peak * hparams.rescaling_max

                # Process the utterance
                metadata = process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
                                             skip_existing, hparams, trim_silence=False)
                if metadata is not None:
                    metadata_file.write("|".join(str(x) for x in metadata) + "\n")

    # Verify the contents of the metadata file
    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
        # Drop the line terminator so it is not counted as a character of the text field
        metadata = [line.rstrip("\n").split("|") for line in metadata_file if line.strip()]
    mel_frames = sum(int(m[4]) for m in metadata)
    timesteps = sum(int(m[3]) for m in metadata)
    sample_rate = hparams.sample_rate
    hours = (timesteps / sample_rate) / 3600
    print(f"The {mode} dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
          (len(metadata), mel_frames, timesteps, hours))
    if not metadata:
        # max() of an empty sequence raises ValueError; there is nothing more to report
        return
    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
|
||||
|
||||
|
||||
def split_on_silences(wav_fpath, words, end_times, hparams):
|
||||
# Load the audio waveform
|
||||
wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
|
||||
@@ -219,7 +309,7 @@ def split_on_silences(wav_fpath, words, end_times, hparams):
|
||||
|
||||
|
||||
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||
skip_existing: bool, hparams):
|
||||
skip_existing: bool, hparams, trim_silence=True):
|
||||
## FOR REFERENCE:
|
||||
# For you not to lose your head if you ever wish to change things here or implement your own
|
||||
# synthesizer.
|
||||
@@ -240,8 +330,7 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||
return None
|
||||
|
||||
# Trim silence
|
||||
if hparams.trim_silence:
|
||||
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
|
||||
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
|
||||
|
||||
# Skip utterances that are too short
|
||||
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
|
||||
@@ -277,10 +366,10 @@ def embed_utterance(fpaths, encoder_model_fpath):
|
||||
|
||||
def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
|
||||
# create train embeddings
|
||||
train_wav_dir = synthesizer_root.joinpath("train-clean/audio")
|
||||
train_metadata_fpath = synthesizer_root.joinpath("train-clean/train.txt")
|
||||
train_wav_dir = synthesizer_root.joinpath("train/audio")
|
||||
train_metadata_fpath = synthesizer_root.joinpath("train/train.txt")
|
||||
assert train_wav_dir.exists() and train_metadata_fpath.exists()
|
||||
train_embed_dir = synthesizer_root.joinpath("train-clean/embeds")
|
||||
train_embed_dir = synthesizer_root.joinpath("train/embeds")
|
||||
train_embed_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Gather the input wave filepath and the target output embed filepath
|
||||
@@ -295,10 +384,10 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
|
||||
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))
|
||||
|
||||
# create dev embeddings
|
||||
dev_wav_dir = synthesizer_root.joinpath("dev-clean/audio")
|
||||
dev_metadata_fpath = synthesizer_root.joinpath("dev-clean/dev.txt")
|
||||
dev_wav_dir = synthesizer_root.joinpath("dev/audio")
|
||||
dev_metadata_fpath = synthesizer_root.joinpath("dev/dev.txt")
|
||||
assert dev_wav_dir.exists() and dev_metadata_fpath.exists()
|
||||
dev_embed_dir = synthesizer_root.joinpath("dev-clean/embeds")
|
||||
dev_embed_dir = synthesizer_root.joinpath("dev/embeds")
|
||||
dev_embed_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Gather the input wave filepath and the target output embed filepath
|
||||
|
||||
@@ -16,10 +16,10 @@ from synthesizer.utils.symbols import symbols
|
||||
|
||||
def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
|
||||
# This generates ground truth-aligned mels for vocoder training
|
||||
train_in_dir = in_dir.joinpath("train-clean")
|
||||
train_out_dir = out_dir.joinpath("train-clean")
|
||||
dev_in_dir = in_dir.joinpath("dev-clean")
|
||||
dev_out_dir = out_dir.joinpath("dev-clean")
|
||||
train_in_dir = in_dir.joinpath("train")
|
||||
train_out_dir = out_dir.joinpath("train")
|
||||
dev_in_dir = in_dir.joinpath("dev")
|
||||
dev_out_dir = out_dir.joinpath("dev")
|
||||
train_synth_dir = train_out_dir / "mels_gta"
|
||||
train_synth_dir.mkdir(exist_ok=True, parents=True)
|
||||
dev_synth_dir = dev_out_dir / "mels_gta"
|
||||
|
||||
@@ -16,7 +16,6 @@ from synthesizer.utils.plot import plot_spectrogram
|
||||
from synthesizer.utils.symbols import symbols
|
||||
from synthesizer.utils.text import sequence_to_text
|
||||
from vocoder.display import *
|
||||
from utils.profiler import Profiler
|
||||
|
||||
|
||||
def np_now(x: torch.Tensor): return x.detach().cpu().numpy()
|
||||
@@ -32,8 +31,15 @@ def sync(device: torch.device):
|
||||
torch.cuda.synchronize(device)
|
||||
|
||||
|
||||
def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool,
|
||||
def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool, use_tb: bool,
|
||||
hparams):
|
||||
if use_tb:
|
||||
print("Use Tensorboard")
|
||||
import tensorflow as tf
|
||||
import datetime
|
||||
# Hide GPU from visible devices
|
||||
log_dir = f"log/synthesizer/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
train_summary_writer = tf.summary.create_file_writer(log_dir)
|
||||
models_dir.mkdir(exist_ok=True)
|
||||
|
||||
model_dir = models_dir.joinpath(run_id)
|
||||
@@ -48,8 +54,8 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
|
||||
meta_folder.mkdir(exist_ok=True)
|
||||
|
||||
weights_fpath = model_dir / f"synthesizer.pt"
|
||||
train_metadata_fpath = syn_dir.joinpath("train-clean/train.txt")
|
||||
dev_metadata_fpath = syn_dir.joinpath("dev-clean/dev.txt")
|
||||
train_metadata_fpath = syn_dir.joinpath("train/train.txt")
|
||||
dev_metadata_fpath = syn_dir.joinpath("dev/dev.txt")
|
||||
|
||||
print("Checkpoint path: {}".format(weights_fpath))
|
||||
print("Loading training data from: {}".format(train_metadata_fpath))
|
||||
@@ -91,11 +97,11 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
|
||||
# Initialize the optimizer
|
||||
optimizer = optim.Adam(model.parameters())
|
||||
|
||||
train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
|
||||
dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
|
||||
if not exists("synthesizer_loss"):
|
||||
import os
|
||||
os.mkdir("synthesizer_loss")
|
||||
# train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
|
||||
# dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
|
||||
# if not exists("synthesizer_loss"):
|
||||
# import os
|
||||
# os.mkdir("synthesizer_loss")
|
||||
|
||||
# Load the weights
|
||||
if force_restart or not weights_fpath.exists():
|
||||
@@ -111,26 +117,26 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
|
||||
|
||||
f.write("{}\n".format(symbol))
|
||||
|
||||
losses = []
|
||||
dev_losses = []
|
||||
# losses = []
|
||||
# dev_losses = []
|
||||
|
||||
else:
|
||||
print("\nLoading weights at %s" % weights_fpath)
|
||||
model.load(weights_fpath, optimizer)
|
||||
print("Tacotron weights loaded from step %d" % model.step)
|
||||
losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
|
||||
dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
|
||||
# losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
|
||||
# dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
|
||||
|
||||
# Initialize the dataset
|
||||
train_mel_dir = syn_dir.joinpath("train-clean/mels")
|
||||
train_embed_dir = syn_dir.joinpath("train-clean/embeds")
|
||||
dev_mel_dir = syn_dir.joinpath("dev-clean/mels")
|
||||
dev_embed_dir = syn_dir.joinpath("dev-clean/embeds")
|
||||
train_mel_dir = syn_dir.joinpath("train/mels")
|
||||
train_embed_dir = syn_dir.joinpath("train/embeds")
|
||||
dev_mel_dir = syn_dir.joinpath("dev/mels")
|
||||
dev_embed_dir = syn_dir.joinpath("dev/embeds")
|
||||
train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)
|
||||
dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)
|
||||
|
||||
best_loss_file_path = "synthesizer_loss/best_loss.npy"
|
||||
best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
|
||||
# best_loss_file_path = "synthesizer_loss/best_loss.npy"
|
||||
# best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
|
||||
|
||||
# profiler = Profiler(summarize_every=10, disabled=False)
|
||||
for i, session in enumerate(hparams.tts_schedule):
|
||||
@@ -230,28 +236,36 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup
|
||||
f"{1./time_window.average:#.2} steps/s | Step: {k}k | "
|
||||
stream(msg)
|
||||
|
||||
if use_tb:
|
||||
with train_summary_writer.as_default():
|
||||
tf.summary.scalar('train_loss', loss_window.average, step=step)
|
||||
tf.summary.scalar('learning_rate', lr, step=step)
|
||||
|
||||
# Backup or save model as appropriate
|
||||
# if backup_every != 0 and step % backup_every == 0 :
|
||||
# backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt"
|
||||
# model.save(backup_fpath, optimizer)
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
if save_every != 0 and i % save_every == 0:
|
||||
dev_loss = validate(dev_dataset, model, collate_fn)
|
||||
msg = f"\n| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Train Loss: {loss_window.average:#.4} | " \
|
||||
f"Dev Loss: {dev_loss:#.4} | {1./time_window.average:#.2} steps/s | Step: {k}k | "
|
||||
print(msg)
|
||||
losses.append(loss_window.average)
|
||||
np.save(train_loss_file_path, np.array(losses, dtype=float))
|
||||
|
||||
dev_losses.append(dev_loss)
|
||||
np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
|
||||
if use_tb:
|
||||
with train_summary_writer.as_default():
|
||||
tf.summary.scalar('val_loss', dev_loss, step=step)
|
||||
# losses.append(loss_window.average)
|
||||
# np.save(train_loss_file_path, np.array(losses, dtype=float))
|
||||
|
||||
if dev_loss < best_loss:
|
||||
# Must save latest optimizer state to ensure that resuming training
|
||||
# doesn't produce artifacts
|
||||
best_loss = dev_loss
|
||||
np.save(best_loss_file_path, np.array([best_loss]))
|
||||
model.save(weights_fpath, optimizer)
|
||||
# dev_losses.append(dev_loss)
|
||||
# np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
|
||||
|
||||
# Must save latest optimizer state to ensure that resuming training
|
||||
# doesn't produce artifacts
|
||||
# best_loss = dev_loss
|
||||
# np.save(best_loss_file_path, np.array([best_loss]))
|
||||
model.save(weights_fpath, optimizer)
|
||||
|
||||
# Evaluate model to generate dev samples
|
||||
# epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch # If epoch is done
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from synthesizer.preprocess import preprocess_dataset
|
||||
from synthesizer.preprocess import preprocess_librispeech, preprocess_vctk
|
||||
from synthesizer.hparams import hparams
|
||||
from utils.argutils import print_args
|
||||
from pathlib import Path
|
||||
@@ -24,12 +24,9 @@ if __name__ == "__main__":
|
||||
"interrupted.")
|
||||
parser.add_argument("--hparams", type=str, default="", help=\
|
||||
"Hyperparameter overrides as a comma-separated list of name-value pairs")
|
||||
parser.add_argument("--no_alignments", action="store_true", help=\
|
||||
"Use this option when dataset does not include alignments\
|
||||
(these are used to split long audio files into sub-utterances.)")
|
||||
parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
|
||||
parser.add_argument("--datasets_names", type=list, default=["LibriSpeech","VCTK"], help=\
|
||||
"Name of the dataset directory to process.")
|
||||
parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360,dev-clean", help=\
|
||||
parser.add_argument("--all_subfolders", type=list, default=["train-clean-100,train-clean-360,dev-clean", "wav48_silence_trimmed"], help=\
|
||||
"Comma-separated list of subfolders to process inside your dataset directory")
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -44,4 +41,15 @@ if __name__ == "__main__":
|
||||
# Preprocess the dataset
|
||||
print_args(args, parser)
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
preprocess_dataset(**vars(args))
|
||||
preprocess_func = {
|
||||
"LibriSpeech": preprocess_librispeech,
|
||||
"VCTK": preprocess_vctk,
|
||||
}
|
||||
args = vars(args)
|
||||
for i in range(len(args["datasets_names"])):
|
||||
dataset = args["datasets_names"][i]
|
||||
subfolders = args["all_subfolders"][i]
|
||||
print("Preprocessing %s" % dataset)
|
||||
|
||||
preprocess_func[dataset](datasets_root=args["datasets_root"], out_dir=args["out_dir"], n_processes=args["n_processes"], skip_existing=args["skip_existing"], hparams=args["hparams"],
|
||||
datasets_name=dataset, subfolders=subfolders)
|
||||
|
||||
@@ -17,7 +17,7 @@ if __name__ == "__main__":
|
||||
"the wavs and the embeds.")
|
||||
parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\
|
||||
"Path to the output directory that will contain the saved model weights and the logs.")
|
||||
parser.add_argument("-s", "--save_every", type=int, default=100, help= \
|
||||
parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
|
||||
"Number of steps between updates of the model on the disk. Set to 0 to never save the "
|
||||
"model.")
|
||||
parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \
|
||||
@@ -25,6 +25,8 @@ if __name__ == "__main__":
|
||||
"model.")
|
||||
parser.add_argument("-f", "--force_restart", action="store_true", help= \
|
||||
"Do not load any saved model and restart from scratch.")
|
||||
parser.add_argument("--use_tb", action="store_true", help= \
|
||||
"Use Tensorboard support")
|
||||
parser.add_argument("--hparams", default="", help=\
|
||||
"Hyperparameter overrides as a comma-separated list of name=value pairs")
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -19,12 +19,19 @@ from utils.profiler import Profiler
|
||||
|
||||
|
||||
def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, save_every: int,
|
||||
backup_every: int, force_restart: bool):
|
||||
backup_every: int, force_restart: bool, use_tb: bool):
|
||||
if use_tb:
|
||||
print("Use Tensorboard")
|
||||
import tensorflow as tf
|
||||
import datetime
|
||||
# Hide GPU from visible devices
|
||||
log_dir = f"log/vocoder/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
train_summary_writer = tf.summary.create_file_writer(log_dir)
|
||||
# Check to make sure the hop length is correctly factorised
|
||||
train_syn_dir = syn_dir.joinpath("train-clean")
|
||||
train_voc_dir = voc_dir.joinpath("train-clean")
|
||||
dev_syn_dir = syn_dir.joinpath("dev-clean")
|
||||
dev_voc_dir = voc_dir.joinpath("dev-clean")
|
||||
train_syn_dir = syn_dir.joinpath("train")
|
||||
train_voc_dir = voc_dir.joinpath("train")
|
||||
dev_syn_dir = syn_dir.joinpath("dev")
|
||||
dev_voc_dir = voc_dir.joinpath("dev")
|
||||
assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length
|
||||
|
||||
# Instantiate the model
|
||||
@@ -58,23 +65,23 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
|
||||
model_dir = models_dir / run_id
|
||||
model_dir.mkdir(exist_ok=True)
|
||||
weights_fpath = model_dir / "vocoder.pt"
|
||||
train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
|
||||
dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"
|
||||
# train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
|
||||
# dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"
|
||||
|
||||
if not exists("vocoder_loss"):
|
||||
import os
|
||||
os.mkdir("vocoder_loss")
|
||||
# if not exists("vocoder_loss"):
|
||||
# import os
|
||||
# os.mkdir("vocoder_loss")
|
||||
if force_restart or not weights_fpath.exists():
|
||||
print("\nStarting the training of WaveRNN from scratch\n")
|
||||
model.save(weights_fpath, optimizer)
|
||||
losses = []
|
||||
dev_losses = []
|
||||
# losses = []
|
||||
# dev_losses = []
|
||||
else:
|
||||
print("\nLoading weights at %s" % weights_fpath)
|
||||
model.load(weights_fpath, optimizer)
|
||||
print("WaveRNN weights loaded from step %d" % model.step)
|
||||
losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
|
||||
dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
|
||||
# losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
|
||||
# dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
|
||||
|
||||
# Initialize the dataset
|
||||
train_metadata_fpath = train_syn_dir.joinpath("train.txt") if ground_truth else \
|
||||
@@ -96,8 +103,8 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
|
||||
simple_table([('Batch size', hp.voc_batch_size),
|
||||
('LR', hp.voc_lr),
|
||||
('Sequence Len', hp.voc_seq_len)])
|
||||
best_loss_file_path = "vocoder_loss/best_loss.npy"
|
||||
best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
|
||||
# best_loss_file_path = "vocoder_loss/best_loss.npy"
|
||||
# best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
|
||||
|
||||
# profiler = Profiler(summarize_every=10, disabled=False)
|
||||
for epoch in range(1, 350):
|
||||
@@ -139,6 +146,10 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
|
||||
f"{speed:.4f}steps/s | Step: {k}k | "
|
||||
stream(msg)
|
||||
|
||||
if use_tb:
|
||||
with train_summary_writer.as_default():
|
||||
tf.summary.scalar('train_loss', train_loss_window.average, step=step)
|
||||
|
||||
if backup_every != 0 and i % backup_every == 0 :
|
||||
model.checkpoint(model_dir, optimizer)
|
||||
|
||||
@@ -148,14 +159,18 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
|
||||
f"Train Loss: {train_loss_window.average:.4f} | Dev Loss: {dev_loss:.4f} | " \
|
||||
f"{speed:.4f}steps/s | Step: {k}k | "
|
||||
stream(msg)
|
||||
losses.append(train_loss_window.average)
|
||||
np.save(train_loss_file_path, np.array(losses, dtype=float))
|
||||
dev_losses.append(dev_loss)
|
||||
np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
|
||||
if dev_loss < best_loss :
|
||||
best_loss = dev_loss
|
||||
np.save(best_loss_file_path, np.array([best_loss]))
|
||||
model.save(weights_fpath, optimizer)
|
||||
|
||||
if use_tb:
|
||||
with train_summary_writer.as_default():
|
||||
tf.summary.scalar('val_loss', dev_loss, step=step)
|
||||
# losses.append(train_loss_window.average)
|
||||
# np.save(train_loss_file_path, np.array(losses, dtype=float))
|
||||
# dev_losses.append(dev_loss)
|
||||
# np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
|
||||
# if dev_loss < best_loss :
|
||||
# best_loss = dev_loss
|
||||
# np.save(best_loss_file_path, np.array([best_loss]))
|
||||
model.save(weights_fpath, optimizer)
|
||||
|
||||
# profiler.tick("Extra saving")
|
||||
|
||||
|
||||
@@ -38,6 +38,8 @@ if __name__ == "__main__":
|
||||
"model.")
|
||||
parser.add_argument("-f", "--force_restart", action="store_true", help= \
|
||||
"Do not load any saved model and restart from scratch.")
|
||||
parser.add_argument("--use_tb", action="store_true", help= \
|
||||
"Use Tensorboard support")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Process the arguments
|
||||
|
||||
Reference in New Issue
Block a user