synthesizer/synthesize.py

import platform
from functools import partial
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from synthesizer.hparams import hparams_debug_string
from synthesizer.models.tacotron import Tacotron
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
from synthesizer.utils import data_parallel_workaround
from synthesizer.utils.symbols import symbols


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    """Generate ground truth-aligned (GTA) mels for vocoder training.

    For each of the "train" and "dev" splits, the Tacotron model is run on the
    batches produced by the split's dataset and the predicted mels are saved
    under ``out_dir/<split>/mels_gta`` using the same file names as the target
    mels. The metadata record of every processed utterance is written to
    ``out_dir/<split>/synthesized.txt``.

    Args:
        in_dir: Directory containing ``train/`` and ``dev/`` sub-directories.
            Each one is read for ``<split>.txt`` (metadata), ``mels/`` and
            ``embeds/``.
        out_dir: Directory under which ``train/`` and ``dev/`` outputs are
            created (missing parents are created as well).
        syn_model_fpath: Path of the Tacotron weights handed to ``model.load``.
        hparams: Hyper-parameter object; this function reads
            ``synthesis_batch_size``, ``num_mels``, ``speaker_embedding_size``
            and the ``tts_*`` model settings from it.

    Raises:
        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
            is not a multiple of the number of visible GPUs.
    """
    train_in_dir = in_dir.joinpath("train")
    train_out_dir = out_dir.joinpath("train")
    dev_in_dir = in_dir.joinpath("dev")
    dev_out_dir = out_dir.joinpath("dev")
    train_synth_dir = train_out_dir / "mels_gta"
    train_synth_dir.mkdir(exist_ok=True, parents=True)
    dev_synth_dir = dev_out_dir / "mels_gta"
    dev_synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        # NOTE: the multi-GPU code path is currently not used (the model runs on
        # a single device), but the check is kept so callers see the same
        # validation error as before.
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Switch the layers to inference behaviour. This does NOT turn off autograd;
    # gradient tracking is disabled separately around the forward passes.
    model.eval()

    # Initialize the datasets. Both are built before any synthesis starts so a
    # problem with the dev inputs surfaces before the train split is processed.
    train_dataset = SynthesizerDataset(train_in_dir.joinpath("train.txt"),
                                       train_in_dir.joinpath("mels"),
                                       train_in_dir.joinpath("embeds"),
                                       hparams)
    dev_dataset = SynthesizerDataset(dev_in_dir.joinpath("dev.txt"),
                                     dev_in_dir.joinpath("mels"),
                                     dev_in_dir.joinpath("embeds"),
                                     hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    train_data_loader = DataLoader(train_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)
    dev_data_loader = DataLoader(dev_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)

    # Generate train GTA mels, then dev GTA mels
    _write_gta_mels(model, train_dataset, train_data_loader, train_synth_dir,
                    train_out_dir / "synthesized.txt", device)
    _write_gta_mels(model, dev_dataset, dev_data_loader, dev_synth_dir,
                    dev_out_dir / "synthesized.txt", device)


def _write_gta_mels(model, dataset, data_loader, synth_dir: Path, meta_out_fpath: Path, device):
    """Run `model` over one split and save its GTA mels plus their metadata.

    Args:
        model: Model called as ``model(texts, mels, embeds)``; the second of its
            four return values is taken as the batch of predicted mels.
        dataset: Dataset whose ``metadata[k]`` record describes utterance ``k``
            (field 1 is used as the mel file name, field 4 as its length).
        data_loader: Loader yielding ``(texts, mels, embeds, idx)`` batches,
            where ``idx`` holds the dataset indices of the batch items.
        synth_dir: Existing directory the predicted mels are saved into.
        meta_out_fpath: Text file (overwritten) receiving one metadata record
            per synthesized utterance.
        device: Torch device the batch tensors are moved to.
    """
    # Inference only: without no_grad() every forward pass would build an
    # autograd graph that is never used, needlessly increasing memory usage.
    with meta_out_fpath.open("w") as file, torch.no_grad():
        for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)
            _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                record = dataset.metadata[k]

                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = synth_dir.joinpath(record[1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(record[4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file, exactly one record per line.
                # NOTE(review): the dataset appears to keep each line's trailing
                # newline in the last field - confirm. Normalising here keeps that
                # output unchanged and prevents two records from being fused when
                # a field (e.g. from the last line of the input file) has none.
                file.write("|".join(record).rstrip("\n") + "\n")
new commits 2023-10-27 15:03:22 +08:00			`import platform`
			`from functools import partial`
			`from pathlib import Path`

			`import numpy as np`
			`import torch`
			`from torch.utils.data import DataLoader`
			`from tqdm import tqdm`

			`from synthesizer.hparams import hparams_debug_string`
			`from synthesizer.models.tacotron import Tacotron`
			`from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer`
			`from synthesizer.utils import data_parallel_workaround`
			`from synthesizer.utils.symbols import symbols`


			`def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):`
			`# This generates ground truth-aligned mels for vocoder training`
			`train_in_dir = in_dir.joinpath("train")`
			`train_out_dir = out_dir.joinpath("train")`
			`dev_in_dir = in_dir.joinpath("dev")`
			`dev_out_dir = out_dir.joinpath("dev")`
			`train_synth_dir = train_out_dir / "mels_gta"`
			`train_synth_dir.mkdir(exist_ok=True, parents=True)`
			`dev_synth_dir = dev_out_dir / "mels_gta"`
			`dev_synth_dir.mkdir(exist_ok=True, parents=True)`
			`print(hparams_debug_string())`

			`# Check for GPU`
			`if torch.cuda.is_available():`
			`device = torch.device("cuda")`
			`if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:`
			``raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")``
			`else:`
			`device = torch.device("cpu")`
			`print("Synthesizer using device:", device)`

			`# Instantiate Tacotron model`
			`model = Tacotron(embed_dims=hparams.tts_embed_dims,`
			`num_chars=len(symbols),`
			`encoder_dims=hparams.tts_encoder_dims,`
			`decoder_dims=hparams.tts_decoder_dims,`
			`n_mels=hparams.num_mels,`
			`fft_bins=hparams.num_mels,`
			`postnet_dims=hparams.tts_postnet_dims,`
			`encoder_K=hparams.tts_encoder_K,`
			`lstm_dims=hparams.tts_lstm_dims,`
			`postnet_K=hparams.tts_postnet_K,`
			`num_highways=hparams.tts_num_highways,`
			`dropout=0., # Use zero dropout for gta mels`
			`stop_threshold=hparams.tts_stop_threshold,`
			`speaker_embedding_size=hparams.speaker_embedding_size).to(device)`

			`# Load the weights`
			`print("\nLoading weights at %s" % syn_model_fpath)`
			`model.load(syn_model_fpath)`
			`print("Tacotron weights loaded from step %d" % model.step)`

			`# Synthesize using same reduction factor as the model is currently trained`
			`r = np.int32(model.r)`

			`# Set model to eval mode (disable gradient and zoneout)`
			`model.eval()`

			`# Initialize the dataset`
			`train_metadata_fpath = train_in_dir.joinpath("train.txt")`
			`train_mel_dir = train_in_dir.joinpath("mels")`
			`train_embed_dir = train_in_dir.joinpath("embeds")`
			`dev_metadata_fpath = dev_in_dir.joinpath("dev.txt")`
			`dev_mel_dir = dev_in_dir.joinpath("mels")`
			`dev_embed_dir = dev_in_dir.joinpath("embeds")`

			`train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)`
			`dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)`
			`collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)`
			`train_data_loader = DataLoader(train_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)`
			`dev_data_loader = DataLoader(dev_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)`

			`# Generate train GTA mels`
			`train_meta_out_fpath = train_out_dir / "synthesized.txt"`
			`with train_meta_out_fpath.open("w") as file:`
			`for i, (texts, mels, embeds, idx) in tqdm(enumerate(train_data_loader), total=len(train_data_loader)):`
			`texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)`

			`# Parallelize model onto GPUS using workaround due to python bug`
			`# if device.type == "cuda" and torch.cuda.device_count() > 1:`
			`# _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)`
			`# else:`
			`_, mels_out, _, _ = model(texts, mels, embeds)`

			`for j, k in enumerate(idx):`
			`# Note: outputs mel-spectrogram files and target ones have same names, just different folders`
			`mel_filename = Path(train_synth_dir).joinpath(train_dataset.metadata[k][1])`
			`mel_out = mels_out[j].detach().cpu().numpy().T`

			`# Use the length of the ground truth mel to remove padding from the generated mels`
			`mel_out = mel_out[:int(train_dataset.metadata[k][4])]`

			`# Write the spectrogram to disk`
			`np.save(mel_filename, mel_out, allow_pickle=False)`

			`# Write metadata into the synthesized file`
			`file.write("\|".join(train_dataset.metadata[k]))`

			`# Generate dev GTA mels`
			`dev_meta_out_fpath = dev_out_dir / "synthesized.txt"`
			`with dev_meta_out_fpath.open("w") as file:`
			`for i, (texts, mels, embeds, idx) in tqdm(enumerate(dev_data_loader), total=len(dev_data_loader)):`
			`texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)`

			`# Parallelize model onto GPUS using workaround due to python bug`
			`# if device.type == "cuda" and torch.cuda.device_count() > 1:`
			`# _, mels_out, _ = data_parallel_workaround(model, texts, mels, embeds)`
			`# else:`
			`_, mels_out, _, _ = model(texts, mels, embeds)`

			`for j, k in enumerate(idx):`
			`# Note: outputs mel-spectrogram files and target ones have same names, just different folders`
			`mel_filename = Path(dev_synth_dir).joinpath(dev_dataset.metadata[k][1])`
			`mel_out = mels_out[j].detach().cpu().numpy().T`

			`# Use the length of the ground truth mel to remove padding from the generated mels`
			`mel_out = mel_out[:int(dev_dataset.metadata[k][4])]`

			`# Write the spectrogram to disk`
			`np.save(mel_filename, mel_out, allow_pickle=False)`

			`# Write metadata into the synthesized file`
			`file.write("\|".join(dev_dataset.metadata[k]))`