mirror of
https://github.com/liuhaozhe6788/voice-cloning-collab.git
synced 2026-05-18 05:04:51 +02:00
131 lines
6.0 KiB
Python
131 lines
6.0 KiB
Python
|
|
import platform
|
||
|
|
from functools import partial
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
import torch
|
||
|
|
from torch.utils.data import DataLoader
|
||
|
|
from tqdm import tqdm
|
||
|
|
|
||
|
|
from synthesizer.hparams import hparams_debug_string
|
||
|
|
from synthesizer.models.tacotron import Tacotron
|
||
|
|
from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer
|
||
|
|
from synthesizer.utils import data_parallel_workaround
|
||
|
|
from synthesizer.utils.symbols import symbols
|
||
|
|
|
||
|
|
|
||
|
|
def _synthesize_split(model, dataset, data_loader, synth_dir: Path, meta_out_fpath: Path, device):
    """Generate GTA mels for one dataset split and write them to disk.

    For every batch yielded by ``data_loader`` the model is run on the texts,
    ground-truth mels and speaker embeddings; each predicted mel is trimmed
    and saved under ``synth_dir``, and the matching metadata row is appended
    to ``meta_out_fpath``.

    Args:
        model: Tacotron model, already loaded and switched to eval mode.
        dataset: Dataset backing ``data_loader``; its ``metadata`` rows are
            indexed with the sample indices returned by the loader.
        data_loader: Loader yielding ``(texts, mels, embeds, idx)`` batches.
        synth_dir: Directory receiving the generated mel ``.npy`` files.
        meta_out_fpath: Metadata file to (over)write for this split.
        device: Torch device the batch tensors are moved to.
    """
    with meta_out_fpath.open("w") as file:
        for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
            texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device)

            # Multi-GPU execution through data_parallel_workaround is disabled;
            # the model always runs on a single device.
            # Inference only: model.eval() does not turn off autograd, so skip
            # graph construction explicitly to avoid holding activations.
            with torch.no_grad():
                _, mels_out, _, _ = model(texts, mels, embeds)

            for j, k in enumerate(idx):
                entry = dataset.metadata[k]

                # Note: outputs mel-spectrogram files and target ones have same names, just different folders
                mel_filename = synth_dir.joinpath(entry[1])
                mel_out = mels_out[j].detach().cpu().numpy().T

                # Use the length of the ground truth mel to remove padding from the generated mels
                mel_out = mel_out[:int(entry[4])]

                # Write the spectrogram to disk
                np.save(mel_filename, mel_out, allow_pickle=False)

                # Write metadata into the synthesized file. The fields are joined
                # as-is with no newline added here.
                # NOTE(review): presumably the last field keeps its trailing
                # newline from the input metadata file - confirm in SynthesizerDataset.
                file.write("|".join(entry))


def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams):
    """Generate ground truth-aligned (GTA) mels for vocoder training.

    Reads the "train" and "dev" splits below ``in_dir`` (metadata file, "mels"
    and "embeds" folders), runs the Tacotron checkpoint at ``syn_model_fpath``
    over them and writes, for each split, the predicted mels to
    ``out_dir/<split>/mels_gta`` and a ``synthesized.txt`` metadata file to
    ``out_dir/<split>``.

    Args:
        in_dir: Root directory containing the "train" and "dev" input splits.
        out_dir: Root directory for the generated output; created if missing.
        syn_model_fpath: Path to the synthesizer weights to load.
        hparams: Hyperparameter object providing the ``tts_*``, ``num_mels``,
            ``speaker_embedding_size`` and ``synthesis_batch_size`` values.

    Raises:
        ValueError: If CUDA is available and ``hparams.synthesis_batch_size``
            is not evenly divisible by the number of visible GPUs.
    """
    train_in_dir = in_dir.joinpath("train")
    train_out_dir = out_dir.joinpath("train")
    dev_in_dir = in_dir.joinpath("dev")
    dev_out_dir = out_dir.joinpath("dev")
    train_synth_dir = train_out_dir / "mels_gta"
    train_synth_dir.mkdir(exist_ok=True, parents=True)
    dev_synth_dir = dev_out_dir / "mels_gta"
    dev_synth_dir.mkdir(exist_ok=True, parents=True)
    print(hparams_debug_string())

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    print("\nLoading weights at %s" % syn_model_fpath)
    model.load(syn_model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Set model to eval mode; gradient tracking is turned off separately around
    # the forward pass in _synthesize_split.
    model.eval()

    # Initialize the dataset
    train_metadata_fpath = train_in_dir.joinpath("train.txt")
    train_mel_dir = train_in_dir.joinpath("mels")
    train_embed_dir = train_in_dir.joinpath("embeds")
    dev_metadata_fpath = dev_in_dir.joinpath("dev.txt")
    dev_mel_dir = dev_in_dir.joinpath("mels")
    dev_embed_dir = dev_in_dir.joinpath("embeds")

    train_dataset = SynthesizerDataset(train_metadata_fpath, train_mel_dir, train_embed_dir, hparams)
    dev_dataset = SynthesizerDataset(dev_metadata_fpath, dev_mel_dir, dev_embed_dir, hparams)
    collate_fn = partial(collate_synthesizer, r=r, hparams=hparams)
    train_data_loader = DataLoader(train_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)
    dev_data_loader = DataLoader(dev_dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2)

    # Generate train GTA mels
    _synthesize_split(model, train_dataset, train_data_loader, train_synth_dir,
                      train_out_dir / "synthesized.txt", device)

    # Generate dev GTA mels
    _synthesize_split(model, dev_dataset, dev_data_loader, dev_synth_dir,
                      dev_out_dir / "synthesized.txt", device)
|
||
|
|
|