给TTS模块增加VCTK数据继续训练模型

2026-05-18 05:04:51 +02:00 · 2023-03-29 16:02:41 +08:00
parent b5296a9a36
commit b77e01b96d
12 changed files with 225 additions and 96 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,4 +21,5 @@ launch.json
 *.pt
 *.whl
 *.m4a
-*.png
+*.png
+log/
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ python synthesizer_train.py <model_id> <datasets_root>/SV2TTS/synthesizer
 ```
 if you want to monitor the training progress, run
 ```
-python update_plot.py syn
+tensorboard --logdir log/synthesizer --host localhost --port 8088
 ```
 ### Vocoder

@@ -89,7 +89,7 @@ python vocoder_train.py <model_id> <datasets_root>
 ```
 if you want to monitor the training progress, run
 ```
-python update_plot.py voc
+tensorboard --logdir log/vocoder --host localhost --port 8080
 ```
 **Note:**

--- a/demo_cli.py
+++ b/demo_cli.py
@@ -157,10 +157,9 @@ if __name__ == '__main__':
            # - If the wav is already loaded:

            # get duration info from input audio
-            # message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
-            #            f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
-            # in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))
-            in_fpath = Path("/home/liuhaozhe/voice_cloning_project/collected_audios/openvoice_official/mellow.mp3")
+            message2 = "Reference voice: enter an audio folder of a voice to be cloned (mp3, " \
+                       f"wav, m4a, flac, ...):({i+1}/{num_of_input_audio})\n"
+            in_fpath = Path(input(message2).replace("\"", "").replace("\'", ""))

            fpath_without_ext = os.path.splitext(str(in_fpath))[0]
            speaker_name = os.path.normpath(fpath_without_ext).split(os.sep)[-1]
--- a/requirements.txt
+++ b/requirements.txt
--- a/synthesizer/hparams.py
+++ b/synthesizer/hparams.py
@@ -49,12 +49,12 @@ hparams = HParams(
                                                    # frame that has all values < -3.4

        ### Tacotron Training
-        tts_schedule = [(2,  1e-3,  20_000,  12),   # Progressive training schedule
-                        (2,  5e-4,  40_000,  12),   # (r, lr, step, batch_size)
-                        (2,  2e-4,  80_000,  12),   #
-                        (2,  1e-4, 160_000,  12),   # r = reduction factor (# of mel frames
-                        (2,  3e-5, 320_000,  12),   #     synthesized for each decoder iteration)
-                        (2,  1e-5, 640_000,  12)],  # lr = learning rate
+        tts_schedule = [(2,  1e-3,  40_000,  12),   # Progressive training schedule
+                        (2,  5e-4,  80_000,  12),   # (r, lr, step, batch_size)
+                        (2,  2e-4,  160_000,  12),   #
+                        (2,  1e-4, 320_000,  12),   # r = reduction factor (# of mel frames
+                        (2,  3e-5, 1280_000,  12),   #     synthesized for each decoder iteration)
+                        (2,  1e-5, 2560_000,  12)],  # lr = learning rate

        tts_clip_grad_norm = 1.0,                   # clips the gradient norm to prevent explosion - set to None if not needed
        tts_eval_interval = 100,                    # Number of steps between model evaluation (sample generation)
@@ -80,12 +80,11 @@ hparams = HParams(
        use_lws = False,                            # "Fast spectrogram phase recovery using local weighted sums"
        symmetric_mels = True,                      # Sets mel range to [-max_abs_value, max_abs_value] if True,
                                                    #               and [0, max_abs_value] if False
-        trim_silence = True,                        # Use with sample_rate of 16000 for best results

        ### SV2TTS
        speaker_embedding_size = 256,               # Dimension for the speaker embedding
        silence_min_duration_split = 0.4,           # Duration in seconds of a silence for an utterance to be split
-        utterance_min_duration = 1.6,               # Duration in seconds below which utterances are discarded
+        utterance_min_duration = 1,                 # Duration in seconds below which utterances are discarded
        )

 def hparams_debug_string():
--- a/synthesizer/preprocess.py
+++ b/synthesizer/preprocess.py
@@ -1,18 +1,20 @@
 from multiprocessing.pool import Pool
 from synthesizer import audio
 from functools import partial
-from itertools import chain
+from itertools import chain, groupby
 from encoder import inference as encoder
 from pathlib import Path
 from utils import logmmse
 from tqdm import tqdm
 import numpy as np
 import librosa
+import random


-def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
-                       no_alignments: bool, datasets_name: str, subfolders: str):
-    # Gather the input directories
+def preprocess_librispeech(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
+                       datasets_name: str, subfolders: str, no_alignments=False):
+    
+    # Gather the input directories of LibriSpeech
    dataset_root = datasets_root.joinpath(datasets_name)
    input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")]
    print("\n    ".join(map(str, ["Using data from:"] + input_dirs)))
@@ -22,7 +24,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
    dev_input_dirs = input_dirs[-1: ]

    # Create the output directories for each output file type
-    train_out_dir = out_dir.joinpath("train-clean")
+    train_out_dir = out_dir.joinpath("train")
    train_out_dir.mkdir(exist_ok=True)
    train_out_dir.joinpath("mels").mkdir(exist_ok=True)
    train_out_dir.joinpath("audio").mkdir(exist_ok=True)
@@ -31,7 +33,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
    train_metadata_fpath = train_out_dir.joinpath("train.txt")
    train_metadata_file = train_metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8")
    
-    dev_out_dir = out_dir.joinpath("dev-clean")
+    dev_out_dir = out_dir.joinpath("dev")
    dev_out_dir.mkdir(exist_ok=True)
    dev_out_dir.joinpath("mels").mkdir(exist_ok=True)
    dev_out_dir.joinpath("audio").mkdir(exist_ok=True)
@@ -87,6 +89,49 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, ski
    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))


+def preprocess_vctk(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams,
+                       datasets_name: str, subfolders: str, no_alignments=True):
+    """Preprocess VCTK: split each speaker's mic1 recordings 90/10 into train/dev and process both.
+
+    Recordings are the "mic1" .flac files found recursively under
+    datasets_root/datasets_name/subfolders, grouped by the part of their parent directory
+    name before the first "_". The dev split therefore draws from the same speakers as the
+    train split. n_processes is accepted but not used by this function.
+
+    Raises FileNotFoundError if the input directory does not exist.
+    """
+    input_dir = datasets_root.joinpath(datasets_name).joinpath(subfolders)
+    print("Using data from:" + str(input_dir))
+    if not input_dir.exists():
+        # Raised explicitly because an assert would be stripped under "python -O"
+        raise FileNotFoundError(f"VCTK input directory not found: {input_dir}")
+
+    # Sorting by (group key, path) makes the grouping and the split independent of rglob order
+    pairs = sorted((p.parts[-2].split('_')[0], p) for p in input_dir.rglob("*.flac"))
+    train_input_fpaths = []
+    dev_input_fpaths = []
+    for _, group in groupby(pairs, lambda pair: pair[0]):
+        speaker_fpaths = sorted(p for _, p in group if "mic1.flac" in str(p))  # only get mic1 flac file
+        # A private RNG seeded with 0 keeps the split reproducible (same order as random.seed(0)
+        # followed by random.shuffle) without resetting the global random state.
+        random.Random(0).shuffle(speaker_fpaths)
+        n = round(len(speaker_fpaths) * 0.9)
+        train_input_fpaths.extend(speaker_fpaths[:n])
+        dev_input_fpaths.extend(speaker_fpaths[n:])
+
+    # Create the output directories for each output file type
+    split_out_dirs = {mode: out_dir.joinpath(mode) for mode in ("train", "dev")}
+    for split_out_dir in split_out_dirs.values():
+        split_out_dir.mkdir(exist_ok=True)
+        split_out_dir.joinpath("mels").mkdir(exist_ok=True)
+        split_out_dir.joinpath("audio").mkdir(exist_ok=True)
+
+    # Preprocess the train split, then the dev split
+    for mode, fpaths in (("train", train_input_fpaths), ("dev", dev_input_fpaths)):
+        preprocess_data(fpaths, mode=mode, out_dir=split_out_dirs[mode], skip_existing=skip_existing,
+                        hparams=hparams, no_alignments=no_alignments)
+
+
 def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
    metadata = []
    for book_dir in speaker_dir.glob("*"):
@@ -146,6 +191,51 @@ def preprocess_speaker(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
    return [m for m in metadata if m is not None]


+def preprocess_data(wav_fpaths, mode, out_dir: Path, skip_existing: bool, hparams, no_alignments: bool):
+    """Preprocess utterances into out_dir, write "<mode>.txt" metadata and print a summary.
+
+    Each wav is paired with the transcript in the same directory whose name is the wav name
+    minus its last "_"-separated token, with a .txt suffix (read as UTF-8); wavs without a
+    transcript are skipped. Utterances are only processed when no_alignments is true.
+    The metadata file is appended to when skip_existing is true, otherwise overwritten.
+    """
+    assert mode in ["train", "dev"]
+    metadata_fpath = out_dir.joinpath(f"{mode}.txt")
+    # The context manager closes the metadata file even if an utterance fails midway
+    with metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") as metadata_file:
+        if no_alignments:
+            for wav_fpath in tqdm(wav_fpaths, desc=mode):
+                # Look up the transcript first so audio is never decoded for wavs without text
+                base_name = "_".join(wav_fpath.name.split(".")[0].split("_")[: -1]) + ".txt"
+                text_fpath = wav_fpath.with_name(base_name)
+                if not text_fpath.exists():
+                    continue
+                text = text_fpath.read_text(encoding="utf-8").replace("\"", "").strip()
+                # sr is passed by keyword: newer librosa releases reject it as a positional argument
+                wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
+                if hparams.rescale:
+                    wav = wav / np.abs(wav).max() * hparams.rescaling_max
+                metadata = process_utterance(wav, text, out_dir, str(wav_fpath.with_suffix("").name),
+                                             skip_existing, hparams, trim_silence=False)
+                if metadata is not None:
+                    metadata_file.write("|".join(str(x) for x in metadata) + "\n")
+
+    # Verify the contents of the metadata file
+    with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
+        metadata = [line.split("|") for line in metadata_file]
+    if not metadata:  # max() below raises ValueError on an empty sequence
+        print(f"The {mode} dataset is empty: no utterances were written to {metadata_fpath}.")
+        return
+    mel_frames = sum([int(m[4]) for m in metadata])
+    timesteps = sum([int(m[3]) for m in metadata])
+    hours = (timesteps / hparams.sample_rate) / 3600
+    print(f"The {mode} dataset consists of %d utterances, %d mel frames, %d audio timesteps (%.2f hours)." %
+          (len(metadata), mel_frames, timesteps, hours))
+    print("Max input length (text chars): %d" % max(len(m[5]) for m in metadata))
+    print("Max mel frames length: %d" % max(int(m[4]) for m in metadata))
+    print("Max audio timesteps length: %d" % max(int(m[3]) for m in metadata))
+
+
 def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
@@ -219,7 +309,7 @@ def split_on_silences(wav_fpath, words, end_times, hparams):


 def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
-                      skip_existing: bool, hparams):
+                      skip_existing: bool, hparams, trim_silence=True):
    ## FOR REFERENCE:
    # For you not to lose your head if you ever wish to change things here or implement your own
    # synthesizer.
@@ -240,8 +330,7 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
        return None

    # Trim silence
-    if hparams.trim_silence:
-        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)
+    wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
@@ -277,10 +366,10 @@ def embed_utterance(fpaths, encoder_model_fpath):

 def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
    # create train embeddings
-    train_wav_dir = synthesizer_root.joinpath("train-clean/audio")
-    train_metadata_fpath = synthesizer_root.joinpath("train-clean/train.txt")
+    train_wav_dir = synthesizer_root.joinpath("train/audio")
+    train_metadata_fpath = synthesizer_root.joinpath("train/train.txt")
    assert train_wav_dir.exists() and train_metadata_fpath.exists()
-    train_embed_dir = synthesizer_root.joinpath("train-clean/embeds")
+    train_embed_dir = synthesizer_root.joinpath("train/embeds")
    train_embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
@@ -295,10 +384,10 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
    list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

    # create dev embeddings
-    dev_wav_dir = synthesizer_root.joinpath("dev-clean/audio")
-    dev_metadata_fpath = synthesizer_root.joinpath("dev-clean/dev.txt")
+    dev_wav_dir = synthesizer_root.joinpath("dev/audio")
+    dev_metadata_fpath = synthesizer_root.joinpath("dev/dev.txt")
    assert dev_wav_dir.exists() and dev_metadata_fpath.exists()
-    dev_embed_dir = synthesizer_root.joinpath("dev-clean/embeds")
+    dev_embed_dir = synthesizer_root.joinpath("dev/embeds")
    dev_embed_dir.mkdir(exist_ok=True)

    # Gather the input wave filepath and the target output embed filepath
--- a/synthesizer/synthesize.py
+++ b/synthesizer/synthesize.py
@@ -16,10 +16,10 @@ from synthesizer.utils.symbols import symbols

 def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    # This generates ground truth-aligned mels for vocoder training
-    train_in_dir = in_dir.joinpath("train-clean")
-    train_out_dir = out_dir.joinpath("train-clean")
-    dev_in_dir = in_dir.joinpath("dev-clean")
-    dev_out_dir = out_dir.joinpath("dev-clean")
+    train_in_dir = in_dir.joinpath("train")
+    train_out_dir = out_dir.joinpath("train")
+    dev_in_dir = in_dir.joinpath("dev")
+    dev_out_dir = out_dir.joinpath("dev")
    train_synth_dir = train_out_dir / "mels_gta"
    train_synth_dir.mkdir(exist_ok=True, parents=True)
    dev_synth_dir = dev_out_dir / "mels_gta"
--- a/synthesizer/train.py
+++ b/synthesizer/train.py
@@ -16,7 +16,6 @@ from synthesizer.utils.plot import plot_spectrogram
 from synthesizer.utils.symbols import symbols
 from synthesizer.utils.text import sequence_to_text
 from vocoder.display import *
-from utils.profiler import Profiler


 def np_now(x: torch.Tensor): return x.detach().cpu().numpy()
@@ -32,8 +31,15 @@ def sync(device: torch.device):
        torch.cuda.synchronize(device)


-def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup_every: int, force_restart: bool,
+def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup_every: int, force_restart: bool, use_tb: bool, 
          hparams):
+    if use_tb:
+        print("Use Tensorboard")
+        import tensorflow as tf
+        import datetime
+        # Write TensorBoard summaries to a timestamped run directory
+        log_dir = f"log/synthesizer/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        train_summary_writer = tf.summary.create_file_writer(log_dir)
    models_dir.mkdir(exist_ok=True)

    model_dir = models_dir.joinpath(run_id)
@@ -48,8 +54,8 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup
    meta_folder.mkdir(exist_ok=True)

    weights_fpath = model_dir / f"synthesizer.pt"
-    train_metadata_fpath = syn_dir.joinpath("train-clean/train.txt")
-    dev_metadata_fpath = syn_dir.joinpath("dev-clean/dev.txt")
+    train_metadata_fpath = syn_dir.joinpath("train/train.txt")
+    dev_metadata_fpath = syn_dir.joinpath("dev/dev.txt")

    print("Checkpoint path: {}".format(weights_fpath))
    print("Loading training data from: {}".format(train_metadata_fpath))
@@ -91,11 +97,11 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup
    # Initialize the optimizer
    optimizer = optim.Adam(model.parameters())
    
-    train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
-    dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
-    if not exists("synthesizer_loss"):
-        import os
-        os.mkdir("synthesizer_loss")
+    # train_loss_file_path = "synthesizer_loss/synthesizer_train_loss.npy"
+    # dev_loss_file_path = "synthesizer_loss/synthesizer_dev_loss.npy"
+    # if not exists("synthesizer_loss"):
+    #     import os
+    #     os.mkdir("synthesizer_loss")
    
    # Load the weights
    if force_restart or not weights_fpath.exists():
@@ -111,26 +117,26 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup

                f.write("{}\n".format(symbol))
                
-        losses = []
-        dev_losses = []
+        # losses = []
+        # dev_losses = []

    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("Tacotron weights loaded from step %d" % model.step)
-        losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
-        dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
+        # losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
+        # dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
        
    # Initialize the dataset
-    train_mel_dir = syn_dir.joinpath("train-clean/mels")
-    train_embed_dir = syn_dir.joinpath("train-clean/embeds")
-    dev_mel_dir = syn_dir.joinpath("dev-clean/mels")
-    dev_embed_dir = syn_dir.joinpath("dev-clean/embeds")
+    train_mel_dir = syn_dir.joinpath("train/mels")
+    train_embed_dir = syn_dir.joinpath("train/embeds")
+    dev_mel_dir = syn_dir.joinpath("dev/mels")
+    dev_embed_dir = syn_dir.joinpath("dev/embeds")
    train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)
    dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)

-    best_loss_file_path = "synthesizer_loss/best_loss.npy"
-    best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
+    # best_loss_file_path = "synthesizer_loss/best_loss.npy"
+    # best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000

    # profiler = Profiler(summarize_every=10, disabled=False)
    for i, session in enumerate(hparams.tts_schedule):
@@ -230,28 +236,36 @@ def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int,  backup
                      f"{1./time_window.average:#.2} steps/s | Step: {k}k | "
                stream(msg)

+                if use_tb:
+                    with train_summary_writer.as_default():
+                        tf.summary.scalar('train_loss', loss_window.average, step=step)
+                        tf.summary.scalar('learning_rate', lr, step=step)
+
                # Backup or save model as appropriate
                # if backup_every != 0 and step % backup_every == 0 :
                #     backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt"
                #     model.save(backup_fpath, optimizer)
-
+                torch.cuda.empty_cache()
                if save_every != 0 and i % save_every == 0:
                    dev_loss = validate(dev_dataset, model, collate_fn)
                    msg = f"\n| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Train Loss: {loss_window.average:#.4} | " \
                          f"Dev Loss: {dev_loss:#.4} | {1./time_window.average:#.2} steps/s | Step: {k}k | "
                    print(msg)
-                    losses.append(loss_window.average)
-                    np.save(train_loss_file_path, np.array(losses, dtype=float))

-                    dev_losses.append(dev_loss)
-                    np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
+                    if use_tb:
+                        with train_summary_writer.as_default():
+                            tf.summary.scalar('val_loss', dev_loss, step=step)
+                    # losses.append(loss_window.average)
+                    # np.save(train_loss_file_path, np.array(losses, dtype=float))

-                    if dev_loss < best_loss:
-                        # Must save latest optimizer state to ensure that resuming training
-                        # doesn't produce artifacts
-                        best_loss = dev_loss
-                        np.save(best_loss_file_path, np.array([best_loss]))
-                        model.save(weights_fpath, optimizer)
+                    # dev_losses.append(dev_loss)
+                    # np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
+
+                    # Must save latest optimizer state to ensure that resuming training
+                    # doesn't produce artifacts
+                    # best_loss = dev_loss
+                    # np.save(best_loss_file_path, np.array([best_loss]))
+                    model.save(weights_fpath, optimizer)

                # Evaluate model to generate dev samples
                # epoch_eval = hparams.tts_eval_interval == -1 and i == steps_per_epoch  # If epoch is done
--- a/synthesizer_preprocess_audio.py
+++ b/synthesizer_preprocess_audio.py
@@ -1,4 +1,4 @@
-from synthesizer.preprocess import preprocess_dataset
+from synthesizer.preprocess import preprocess_librispeech, preprocess_vctk
 from synthesizer.hparams import hparams
 from utils.argutils import print_args
 from pathlib import Path
@@ -24,12 +24,9 @@ if __name__ == "__main__":
        "interrupted.")
    parser.add_argument("--hparams", type=str, default="", help=\
        "Hyperparameter overrides as a comma-separated list of name-value pairs")
-    parser.add_argument("--no_alignments", action="store_true", help=\
-        "Use this option when dataset does not include alignments\
-        (these are used to split long audio files into sub-utterances.)")
-    parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\
+    parser.add_argument("--datasets_names", type=list, default=["LibriSpeech","VCTK"], help=\
        "Name of the dataset directory to process.")
-    parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360,dev-clean", help=\
+    parser.add_argument("--all_subfolders", type=list, default=["train-clean-100,train-clean-360,dev-clean", "wav48_silence_trimmed"], help=\
        "Comma-separated list of subfolders to process inside your dataset directory")
    args = parser.parse_args()

@@ -44,4 +41,15 @@ if __name__ == "__main__":
    # Preprocess the dataset
    print_args(args, parser)
    args.hparams = hparams.parse(args.hparams)
-    preprocess_dataset(**vars(args))
+    preprocess_func = {
+        "LibriSpeech": preprocess_librispeech,
+        "VCTK": preprocess_vctk,
+    }
+    args = vars(args)
+    for i in range(len(args["datasets_names"])):
+        dataset = args["datasets_names"][i]
+        subfolders = args["all_subfolders"][i]
+        print("Preprocessing %s" % dataset)
+
+        preprocess_func[dataset](datasets_root=args["datasets_root"], out_dir=args["out_dir"], n_processes=args["n_processes"], skip_existing=args["skip_existing"], hparams=args["hparams"],
+                       datasets_name=dataset, subfolders=subfolders)
--- a/synthesizer_train.py
+++ b/synthesizer_train.py
@@ -17,7 +17,7 @@ if __name__ == "__main__":
        "the wavs and the embeds.")
    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\
        "Path to the output directory that will contain the saved model weights and the logs.")
-    parser.add_argument("-s", "--save_every", type=int, default=100, help= \
+    parser.add_argument("-s", "--save_every", type=int, default=1000, help= \
        "Number of steps between updates of the model on the disk. Set to 0 to never save the "
        "model.")
    parser.add_argument("-b", "--backup_every", type=int, default=25000, help= \
@@ -25,6 +25,8 @@ if __name__ == "__main__":
        "model.")
    parser.add_argument("-f", "--force_restart", action="store_true", help= \
        "Do not load any saved model and restart from scratch.")
+    parser.add_argument("--use_tb", action="store_true", help= \
+        "Use Tensorboard support")
    parser.add_argument("--hparams", default="", help=\
        "Hyperparameter overrides as a comma-separated list of name=value pairs")
    args = parser.parse_args()
--- a/vocoder/train.py
+++ b/vocoder/train.py
@@ -19,12 +19,19 @@ from utils.profiler import Profiler


 def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, save_every: int,
-          backup_every: int, force_restart: bool):
+          backup_every: int, force_restart: bool, use_tb: bool):
+    if use_tb:
+        print("Use Tensorboard")
+        import tensorflow as tf
+        import datetime
+        # Write TensorBoard summaries to a timestamped run directory
+        log_dir = f"log/vocoder/tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+        train_summary_writer = tf.summary.create_file_writer(log_dir)
    # Check to make sure the hop length is correctly factorised
-    train_syn_dir = syn_dir.joinpath("train-clean")
-    train_voc_dir = voc_dir.joinpath("train-clean")    
-    dev_syn_dir = syn_dir.joinpath("dev-clean")
-    dev_voc_dir = voc_dir.joinpath("dev-clean")
+    train_syn_dir = syn_dir.joinpath("train")
+    train_voc_dir = voc_dir.joinpath("train")    
+    dev_syn_dir = syn_dir.joinpath("dev")
+    dev_voc_dir = voc_dir.joinpath("dev")
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    # Instantiate the model
@@ -58,23 +65,23 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
    model_dir = models_dir / run_id
    model_dir.mkdir(exist_ok=True)
    weights_fpath = model_dir / "vocoder.pt"
-    train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
-    dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"
+    # train_loss_file_path = "vocoder_loss/vocoder_train_loss.npy"
+    # dev_loss_file_path = "vocoder_loss/vocoder_dev_loss.npy"

-    if not exists("vocoder_loss"):
-        import os
-        os.mkdir("vocoder_loss")
+    # if not exists("vocoder_loss"):
+    #     import os
+    #     os.mkdir("vocoder_loss")
    if force_restart or not weights_fpath.exists():
        print("\nStarting the training of WaveRNN from scratch\n")
        model.save(weights_fpath, optimizer)
-        losses = []
-        dev_losses = []
+        # losses = []
+        # dev_losses = []
    else:
        print("\nLoading weights at %s" % weights_fpath)
        model.load(weights_fpath, optimizer)
        print("WaveRNN weights loaded from step %d" % model.step)
-        losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
-        dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []
+        # losses = list(np.load(train_loss_file_path)) if exists(train_loss_file_path) else []
+        # dev_losses = list(np.load(dev_loss_file_path)) if exists(dev_loss_file_path) else []

    # Initialize the dataset
    train_metadata_fpath = train_syn_dir.joinpath("train.txt") if ground_truth else \
@@ -96,8 +103,8 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
    simple_table([('Batch size', hp.voc_batch_size),
                  ('LR', hp.voc_lr),
                  ('Sequence Len', hp.voc_seq_len)])
-    best_loss_file_path = "vocoder_loss/best_loss.npy"
-    best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000
+    # best_loss_file_path = "vocoder_loss/best_loss.npy"
+    # best_loss = np.load(best_loss_file_path)[0] if exists(best_loss_file_path) else 1000

    # profiler = Profiler(summarize_every=10, disabled=False)
    for epoch in range(1, 350):
@@ -139,6 +146,10 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
                f"{speed:.4f}steps/s | Step: {k}k | "
            stream(msg)

+            if use_tb:
+                with train_summary_writer.as_default():
+                    tf.summary.scalar('train_loss', train_loss_window.average, step=step)
+
            if backup_every != 0 and i % backup_every == 0 :
                model.checkpoint(model_dir, optimizer)

@@ -148,14 +159,18 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr
                    f"Train Loss: {train_loss_window.average:.4f} | Dev Loss: {dev_loss:.4f} | " \
                    f"{speed:.4f}steps/s | Step: {k}k | "
                stream(msg)
-                losses.append(train_loss_window.average)
-                np.save(train_loss_file_path, np.array(losses, dtype=float))
-                dev_losses.append(dev_loss)
-                np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
-                if dev_loss < best_loss :
-                    best_loss = dev_loss
-                    np.save(best_loss_file_path, np.array([best_loss]))
-                    model.save(weights_fpath, optimizer)
+
+                if use_tb:
+                    with train_summary_writer.as_default():
+                        tf.summary.scalar('val_loss', dev_loss, step=step)
+                # losses.append(train_loss_window.average)
+                # np.save(train_loss_file_path, np.array(losses, dtype=float))
+                # dev_losses.append(dev_loss)
+                # np.save(dev_loss_file_path, np.array(dev_losses, dtype=float))
+                # if dev_loss < best_loss :
+                    # best_loss = dev_loss
+                    # np.save(best_loss_file_path, np.array([best_loss]))
+                model.save(weights_fpath, optimizer)

            # profiler.tick("Extra saving")

--- a/vocoder_train.py
+++ b/vocoder_train.py
@@ -38,6 +38,8 @@ if __name__ == "__main__":
        "model.")
    parser.add_argument("-f", "--force_restart", action="store_true", help= \
        "Do not load any saved model and restart from scratch.")
+    parser.add_argument("--use_tb", action="store_true", help= \
+        "Use Tensorboard support")
    args = parser.parse_args()

    # Process the arguments