mirror of
https://github.com/liuhaozhe6788/voice-cloning-collab.git
synced 2026-05-18 05:04:51 +02:00
new commits
This commit is contained in:
14
demo_cli.py
14
demo_cli.py
@@ -44,7 +44,7 @@ if __name__ == '__main__':
|
||||
|
||||
import encoder.inference
|
||||
import encoder.params_data
|
||||
from synthesizer.inference import Synthesizer
|
||||
from synthesizer.inference import Synthesizer_infer
|
||||
from synthesizer.utils.cleaners import add_breaks, english_cleaners_predict
|
||||
from vocoder import inference as vocoder
|
||||
from vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
|
||||
@@ -73,7 +73,7 @@ if __name__ == '__main__':
|
||||
print("Preparing the encoder and the synthesizer...")
|
||||
ensure_default_models(args.run_id, Path("saved_models"))
|
||||
encoder.inference.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
|
||||
synthesizer = Synthesizer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
|
||||
synthesizer = Synthesizer_infer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
|
||||
if not args.griffin_lim:
|
||||
vocoder.load_model(list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0])
|
||||
|
||||
@@ -201,7 +201,7 @@ if __name__ == '__main__':
|
||||
|
||||
if os.path.exists(standard_fpath):
|
||||
|
||||
standard_wav = Synthesizer.load_preprocess_wav(standard_fpath)
|
||||
standard_wav = Synthesizer_infer.load_preprocess_wav(standard_fpath)
|
||||
preprocessed_standard_wav = encoder.inference.preprocess_wav(standard_wav)
|
||||
print("Loaded standard audio file successfully")
|
||||
|
||||
@@ -223,7 +223,7 @@ if __name__ == '__main__':
|
||||
# If seed is specified, reset torch seed and force synthesizer reload
|
||||
if args.seed is not None:
|
||||
torch.manual_seed(args.seed)
|
||||
synthesizer = Synthesizer(args.syn_model_fpath)
|
||||
synthesizer = Synthesizer_infer(args.syn_model_fpath)
|
||||
|
||||
# The synthesizer works in batch, so you need to put your data in a list or numpy array
|
||||
def preprocess_text(text):
|
||||
@@ -267,17 +267,17 @@ if __name__ == '__main__':
|
||||
if not args.griffin_lim:
|
||||
wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade)
|
||||
else:
|
||||
wav = Synthesizer.griffin_lim(spec)
|
||||
wav = Synthesizer_infer.griffin_lim(spec)
|
||||
|
||||
end_voc = time.time()
|
||||
print(f"Prediction time of vocoder is {end_voc - start_voc}s")
|
||||
print(f"Prediction time of TTS is {end_voc - start_syn}s")
|
||||
|
||||
# Add breaks
|
||||
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
|
||||
b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
|
||||
b_starts = np.concatenate(([0], b_ends[:-1]))
|
||||
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
|
||||
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
|
||||
breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
|
||||
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
|
||||
|
||||
# Trim excess silences to compensate for gaps in spectrograms (issue #53)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import argparse
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
from pathlib import Path
|
||||
|
||||
from toolbox import Toolbox
|
||||
@@ -12,7 +13,7 @@ if __name__ == '__main__':
|
||||
description="Runs the toolbox.",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
parser.add_argument("--run_id", type=str, default="default", help= \
|
||||
parser.add_argument("--run_id", type=str, default="20230609", help= \
|
||||
"Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
|
||||
"from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
|
||||
"states and restart from scratch.")
|
||||
|
||||
@@ -3,8 +3,8 @@ from ffmpeg import audio
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
import parselmouth
|
||||
from synthesizer.inference import Synthesizer
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.inference import Synthesizer_infer
|
||||
from synthesizer.hparams import syn_hparams
|
||||
import soundfile as sf
|
||||
from parselmouth.praat import run_file
|
||||
|
||||
@@ -67,13 +67,13 @@ def TransFormat(fullpath, out_suffix):
|
||||
is_wav_file = False # 原始音频的后缀是否为.wav
|
||||
path_, name = os.path.split(fullpath)
|
||||
name, suffix = os.path.splitext(name)
|
||||
wav = Synthesizer.load_preprocess_wav(fullpath)
|
||||
wav = Synthesizer_infer.load_preprocess_wav(fullpath)
|
||||
if suffix == ".wav": # 如果原始音频的后缀为.wav,则不用进行格式转换
|
||||
is_wav_file = True
|
||||
return is_wav_file, wav, str(fullpath)
|
||||
else: # 如果原始音频的后缀不是.wav,则需要进行格式转换
|
||||
out_file = os.path.join(path_, name + "." + str(out_suffix))
|
||||
sf.write(out_file, wav.astype(np.float32), hparams.sample_rate)
|
||||
sf.write(out_file, wav.astype(np.float32), syn_hparams.sample_rate)
|
||||
return is_wav_file, wav, str(out_file)
|
||||
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ class HParams(object):
|
||||
self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
|
||||
return self
|
||||
|
||||
hparams = HParams(
|
||||
syn_hparams = HParams(
|
||||
### Signal Processing (used in both synthesizer and vocoder)
|
||||
sample_rate = 16000,
|
||||
n_fft = 800,
|
||||
@@ -89,4 +89,4 @@ hparams = HParams(
|
||||
)
|
||||
|
||||
def hparams_debug_string():
|
||||
return str(hparams)
|
||||
return str(syn_hparams)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import torch
|
||||
from synthesizer import audio
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.hparams import syn_hparams
|
||||
from synthesizer.models.tacotron import Tacotron
|
||||
from synthesizer.utils.symbols import symbols
|
||||
from synthesizer.utils.text import text_to_sequence
|
||||
@@ -11,9 +11,9 @@ import numpy as np
|
||||
import librosa
|
||||
|
||||
|
||||
class Synthesizer:
|
||||
sample_rate = hparams.sample_rate
|
||||
hparams = hparams
|
||||
class Synthesizer_infer:
|
||||
sample_rate = syn_hparams.sample_rate
|
||||
hparams = syn_hparams
|
||||
|
||||
def __init__(self, model_fpath: Path, verbose=True):
|
||||
"""
|
||||
@@ -46,20 +46,20 @@ class Synthesizer:
|
||||
"""
|
||||
Instantiates and loads the model given the weights file that was passed in the constructor.
|
||||
"""
|
||||
self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
|
||||
self._model = Tacotron(embed_dims=syn_hparams.tts_embed_dims,
|
||||
num_chars=len(symbols),
|
||||
encoder_dims=hparams.tts_encoder_dims,
|
||||
decoder_dims=hparams.tts_decoder_dims,
|
||||
n_mels=hparams.num_mels,
|
||||
fft_bins=hparams.num_mels,
|
||||
postnet_dims=hparams.tts_postnet_dims,
|
||||
encoder_K=hparams.tts_encoder_K,
|
||||
lstm_dims=hparams.tts_lstm_dims,
|
||||
postnet_K=hparams.tts_postnet_K,
|
||||
num_highways=hparams.tts_num_highways,
|
||||
dropout=hparams.tts_dropout,
|
||||
stop_threshold=hparams.tts_stop_threshold,
|
||||
speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)
|
||||
encoder_dims=syn_hparams.tts_encoder_dims,
|
||||
decoder_dims=syn_hparams.tts_decoder_dims,
|
||||
n_mels=syn_hparams.num_mels,
|
||||
fft_bins=syn_hparams.num_mels,
|
||||
postnet_dims=syn_hparams.tts_postnet_dims,
|
||||
encoder_K=syn_hparams.tts_encoder_K,
|
||||
lstm_dims=syn_hparams.tts_lstm_dims,
|
||||
postnet_K=syn_hparams.tts_postnet_K,
|
||||
num_highways=syn_hparams.tts_num_highways,
|
||||
dropout=syn_hparams.tts_dropout,
|
||||
stop_threshold=syn_hparams.tts_stop_threshold,
|
||||
speaker_embedding_size=syn_hparams.speaker_embedding_size).to(self.device)
|
||||
|
||||
self._model.load(self.model_fpath)
|
||||
self._model.eval()
|
||||
@@ -91,10 +91,10 @@ class Synthesizer:
|
||||
embeddings = [embeddings]
|
||||
|
||||
# Batch inputs
|
||||
batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
|
||||
for i in range(0, len(inputs), hparams.synthesis_batch_size)]
|
||||
batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
|
||||
for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
|
||||
batched_inputs = [inputs[i:i+syn_hparams.synthesis_batch_size]
|
||||
for i in range(0, len(inputs), syn_hparams.synthesis_batch_size)]
|
||||
batched_embeds = [embeddings[i:i+syn_hparams.synthesis_batch_size]
|
||||
for i in range(0, len(embeddings), syn_hparams.synthesis_batch_size)]
|
||||
|
||||
specs = []
|
||||
for i, batch in enumerate(batched_inputs, 1):
|
||||
@@ -121,12 +121,12 @@ class Synthesizer:
|
||||
stop_tokens = stop_tokens.detach().cpu().numpy()
|
||||
for m in mels:
|
||||
# Trim silence from end of each spectrogram
|
||||
while np.max(m[:, -1]) < hparams.tts_stop_threshold:
|
||||
while np.max(m[:, -1]) < syn_hparams.tts_stop_threshold:
|
||||
if m.shape[-1] == 1:
|
||||
break
|
||||
m = m[:, :-1]
|
||||
# Trim silence from start of each spectrogram
|
||||
while np.max(m[:, 0]) < hparams.tts_start_threshold:
|
||||
while np.max(m[:, 0]) < syn_hparams.tts_start_threshold:
|
||||
if m.shape[-1] == 1:
|
||||
break
|
||||
m = m[:, 1:]
|
||||
@@ -142,9 +142,9 @@ class Synthesizer:
|
||||
Loads and preprocesses an audio file under the same conditions the audio files were used to
|
||||
train the synthesizer.
|
||||
"""
|
||||
wav = librosa.load(str(fpath), hparams.sample_rate)[0]
|
||||
if hparams.rescale:
|
||||
wav = wav / np.abs(wav).max() * hparams.rescaling_max
|
||||
wav = librosa.load(str(fpath), syn_hparams.sample_rate)[0]
|
||||
if syn_hparams.rescale:
|
||||
wav = wav / np.abs(wav).max() * syn_hparams.rescaling_max
|
||||
return wav
|
||||
|
||||
@staticmethod
|
||||
@@ -154,11 +154,11 @@ class Synthesizer:
|
||||
were fed to the synthesizer when training.
|
||||
"""
|
||||
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
|
||||
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
|
||||
wav = Synthesizer_infer.load_preprocess_wav(fpath_or_wav)
|
||||
else:
|
||||
wav = fpath_or_wav
|
||||
|
||||
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
|
||||
mel_spectrogram = audio.melspectrogram(wav, syn_hparams).astype(np.float32)
|
||||
return mel_spectrogram
|
||||
|
||||
@staticmethod
|
||||
@@ -167,7 +167,7 @@ class Synthesizer:
|
||||
Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
|
||||
with the same parameters present in hparams.py.
|
||||
"""
|
||||
return audio.inv_mel_spectrogram(mel, hparams)
|
||||
return audio.inv_mel_spectrogram(mel, syn_hparams)
|
||||
|
||||
|
||||
def pad1d(x, max_len, pad_value=0):
|
||||
|
||||
@@ -2,7 +2,7 @@ from multiprocessing.pool import Pool
|
||||
from synthesizer import audio
|
||||
from functools import partial
|
||||
from itertools import chain, groupby
|
||||
from encoder import inference as encoder
|
||||
from encoder import inference as encoder_infer
|
||||
from pathlib import Path
|
||||
from utils import logmmse
|
||||
from tqdm import tqdm
|
||||
@@ -330,7 +330,7 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||
return None
|
||||
|
||||
# Trim silence
|
||||
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
|
||||
wav = encoder_infer.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
|
||||
|
||||
# Skip utterances that are too short
|
||||
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
|
||||
@@ -353,14 +353,14 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
|
||||
|
||||
|
||||
def embed_utterance(fpaths, encoder_model_fpath):
|
||||
if not encoder.is_loaded():
|
||||
encoder.load_model(encoder_model_fpath)
|
||||
if not encoder_infer.is_loaded():
|
||||
encoder_infer.load_model(encoder_model_fpath)
|
||||
|
||||
# Compute the speaker embedding of the utterance
|
||||
wav_fpath, embed_fpath = fpaths
|
||||
wav = np.load(wav_fpath)
|
||||
wav = encoder.preprocess_wav(wav)
|
||||
embed = encoder.embed_utterance(wav)
|
||||
wav = encoder_infer.preprocess_wav(wav)
|
||||
embed = encoder_infer.embed_utterance(wav)
|
||||
np.save(embed_fpath, embed, allow_pickle=False)
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from synthesizer.preprocess import preprocess_librispeech, preprocess_vctk
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.hparams import syn_hparams
|
||||
from utils.argutils import print_args
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
@@ -40,7 +40,7 @@ if __name__ == "__main__":
|
||||
|
||||
# Preprocess the dataset
|
||||
print_args(args, parser)
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
args.hparams = syn_hparams.parse(args.hparams)
|
||||
preprocess_func = {
|
||||
"LibriSpeech": preprocess_librispeech,
|
||||
"VCTK": preprocess_vctk,
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from pathlib import Path
|
||||
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.hparams import syn_hparams
|
||||
from synthesizer.train import train
|
||||
from utils.argutils import print_args
|
||||
import argparse
|
||||
@@ -32,7 +32,7 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
print_args(args, parser)
|
||||
|
||||
args.hparams = hparams.parse(args.hparams)
|
||||
args.hparams = syn_hparams.parse(args.hparams)
|
||||
|
||||
# Run the training
|
||||
train(**vars(args))
|
||||
|
||||
@@ -7,10 +7,13 @@ import re
|
||||
import numpy as np
|
||||
import torch
|
||||
import soundfile as sf
|
||||
import spacy
|
||||
|
||||
from encoder import inference as encoder
|
||||
from synthesizer.inference import Synthesizer
|
||||
from synthesizer.hparams import hparams
|
||||
import encoder
|
||||
from encoder import inference as encoder_infer
|
||||
from synthesizer.inference import Synthesizer_infer
|
||||
from synthesizer.utils.cleaners import add_breaks, english_cleaners_predict
|
||||
from synthesizer.hparams import syn_hparams
|
||||
from toolbox.ui import UI
|
||||
from toolbox.utterance import Utterance
|
||||
from vocoder import inference as vocoder
|
||||
@@ -52,12 +55,13 @@ class Toolbox:
|
||||
self.utterances = set()
|
||||
self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
|
||||
|
||||
self.synthesizer = None # type: Synthesizer
|
||||
self.synthesizer = None # type: Synthesizer_infer
|
||||
self.current_wav = None
|
||||
self.waves_list = []
|
||||
self.waves_count = 0
|
||||
self.waves_namelist = []
|
||||
self.start_generate_time = None
|
||||
self.nlp = spacy.load('en_core_web_sm')
|
||||
|
||||
# Check for webrtcvad (enables removal of silences in vocoder output)
|
||||
try:
|
||||
@@ -100,13 +104,13 @@ class Toolbox:
|
||||
self.ui.browser_browse_button.clicked.connect(func)
|
||||
func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
|
||||
self.ui.utterance_history.currentIndexChanged.connect(func)
|
||||
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
|
||||
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer_infer.sample_rate)
|
||||
self.ui.play_button.clicked.connect(func)
|
||||
self.ui.stop_button.clicked.connect(self.ui.stop)
|
||||
self.ui.record_button.clicked.connect(self.record)
|
||||
|
||||
#Audio
|
||||
self.ui.setup_audio_devices(Synthesizer.sample_rate)
|
||||
self.ui.setup_audio_devices(Synthesizer_infer.sample_rate)
|
||||
|
||||
#Wav playback & save
|
||||
func = lambda: self.replay_last_wav()
|
||||
@@ -129,10 +133,10 @@ class Toolbox:
|
||||
self.current_wav = self.waves_list[index]
|
||||
|
||||
def export_current_wave(self):
|
||||
self.ui.save_audio_file(self.current_wav, Synthesizer.sample_rate)
|
||||
self.ui.save_audio_file(self.current_wav, Synthesizer_infer.sample_rate)
|
||||
|
||||
def replay_last_wav(self):
|
||||
self.ui.play(self.current_wav, Synthesizer.sample_rate)
|
||||
self.ui.play(self.current_wav, Synthesizer_infer.sample_rate)
|
||||
|
||||
def reset_ui(self, run_id: str, models_dir: Path, seed: int=None):
|
||||
self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True)
|
||||
@@ -159,16 +163,16 @@ class Toolbox:
|
||||
|
||||
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
|
||||
# playback, so as to have a fair comparison with the generated audio
|
||||
wav = Synthesizer.load_preprocess_wav(fpath)
|
||||
wav = Synthesizer_infer.load_preprocess_wav(fpath)
|
||||
self.ui.log("Loaded %s" % name)
|
||||
|
||||
self.add_real_utterance(wav, name, speaker_name)
|
||||
|
||||
def record(self):
|
||||
wav = self.ui.record_one(encoder.sampling_rate, 5)
|
||||
wav = self.ui.record_one(encoder_infer.sampling_rate, 5)
|
||||
if wav is None:
|
||||
return
|
||||
self.ui.play(wav, encoder.sampling_rate)
|
||||
self.ui.play(wav, encoder_infer.sampling_rate)
|
||||
|
||||
speaker_name = "user01"
|
||||
name = speaker_name + "_rec_%05d" % np.random.randint(100000)
|
||||
@@ -176,14 +180,15 @@ class Toolbox:
|
||||
|
||||
def add_real_utterance(self, wav, name, speaker_name):
|
||||
# Compute the mel spectrogram
|
||||
spec = Synthesizer.make_spectrogram(wav)
|
||||
spec = Synthesizer_infer.make_spectrogram(wav)
|
||||
self.ui.draw_spec(spec, "current")
|
||||
|
||||
# Compute the embedding
|
||||
if not encoder.is_loaded():
|
||||
if not encoder_infer.is_loaded():
|
||||
self.init_encoder()
|
||||
encoder_wav = encoder.preprocess_wav(wav)
|
||||
embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
|
||||
encoder_wav = encoder_infer.preprocess_wav(wav)
|
||||
embed, partial_embeds, _ = encoder_infer.embed_utterance(encoder_wav, return_partials=True)
|
||||
embed[embed < encoder.params_data.set_zero_thres]=0 # 噪声值置零
|
||||
|
||||
# Add the utterance
|
||||
utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
|
||||
@@ -217,11 +222,20 @@ class Toolbox:
|
||||
if self.synthesizer is None or seed is not None:
|
||||
self.init_synthesizer()
|
||||
|
||||
texts = re.split("\.|,|!|\?|;|:|\n", self.ui.text_prompt.toPlainText())
|
||||
texts = list(filter(None, texts))
|
||||
embed = self.ui.selected_utterance.embed
|
||||
|
||||
def preprocess_text(text):
|
||||
text = add_breaks(text)
|
||||
text = english_cleaners_predict(text)
|
||||
texts = [i.text.strip() for i in self.nlp(text).sents] # split paragraph to sentences
|
||||
return texts
|
||||
|
||||
texts = preprocess_text(self.ui.text_prompt.toPlainText())
|
||||
print(f"the list of inputs texts:\n{texts}")
|
||||
|
||||
embeds = [embed] * len(texts)
|
||||
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
|
||||
specs, alignments, stop_tokens = self.synthesizer.synthesize_spectrograms(texts, embeds, require_visualization=True)
|
||||
|
||||
breaks = [spec.shape[1] for spec in specs]
|
||||
spec = np.concatenate(specs, axis=1)
|
||||
|
||||
@@ -248,35 +262,35 @@ class Toolbox:
|
||||
self.init_vocoder()
|
||||
|
||||
def vocoder_progress(i, seq_len, b_size, gen_rate):
|
||||
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
|
||||
real_time_factor = (gen_rate / Synthesizer_infer.sample_rate) * 1000
|
||||
line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
|
||||
% (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
|
||||
self.ui.log(line, "overwrite")
|
||||
self.ui.set_loading(i, seq_len)
|
||||
if self.ui.current_vocoder_fpath is not None and not self.ui.griffin_lim_checkbox.isChecked():
|
||||
self.ui.log("")
|
||||
wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
|
||||
wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade)
|
||||
else:
|
||||
self.ui.log("Waveform generation with Griffin-Lim... ")
|
||||
wav = Synthesizer.griffin_lim(spec)
|
||||
wav = Synthesizer_infer.griffin_lim(spec)
|
||||
self.ui.set_loading(0)
|
||||
self.ui.log(" Done!", "append")
|
||||
self.ui.log(f"Generate time: {time.time() - self.start_generate_time}s")
|
||||
|
||||
# Add breaks
|
||||
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
|
||||
b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
|
||||
b_starts = np.concatenate(([0], b_ends[:-1]))
|
||||
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
|
||||
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
|
||||
breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
|
||||
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
|
||||
|
||||
# Trim excessive silences
|
||||
if self.ui.trim_silences_checkbox.isChecked():
|
||||
wav = encoder.preprocess_wav(wav)
|
||||
wav = encoder_infer.preprocess_wav(wav)
|
||||
|
||||
# Play it
|
||||
wav = wav / np.abs(wav).max() * 0.5
|
||||
self.ui.play(wav, Synthesizer.sample_rate)
|
||||
wav = wav / np.abs(wav).max() * 4
|
||||
self.ui.play(wav, Synthesizer_infer.sample_rate)
|
||||
|
||||
# Name it (history displayed in combobox)
|
||||
# TODO better naming for the combobox items?
|
||||
@@ -304,10 +318,10 @@ class Toolbox:
|
||||
|
||||
# Compute the embedding
|
||||
# TODO: this is problematic with different sampling rates, gotta fix it
|
||||
if not encoder.is_loaded():
|
||||
if not encoder_infer.is_loaded():
|
||||
self.init_encoder()
|
||||
encoder_wav = encoder.preprocess_wav(wav)
|
||||
embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
|
||||
encoder_wav = encoder_infer.preprocess_wav(wav)
|
||||
embed, partial_embeds, _ = encoder_infer.embed_utterance(encoder_wav, return_partials=True)
|
||||
|
||||
# Add the utterance
|
||||
name = speaker_name + "_gen_%05d" % np.random.randint(100000)
|
||||
@@ -324,7 +338,7 @@ class Toolbox:
|
||||
self.ui.log("Loading the encoder %s... " % model_fpath)
|
||||
self.ui.set_loading(1)
|
||||
start = timer()
|
||||
encoder.load_model(model_fpath)
|
||||
encoder_infer.load_model(model_fpath)
|
||||
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
|
||||
self.ui.set_loading(0)
|
||||
|
||||
@@ -334,7 +348,7 @@ class Toolbox:
|
||||
self.ui.log("Loading the synthesizer %s... " % model_fpath)
|
||||
self.ui.set_loading(1)
|
||||
start = timer()
|
||||
self.synthesizer = Synthesizer(model_fpath)
|
||||
self.synthesizer = Synthesizer_infer(model_fpath)
|
||||
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
|
||||
self.ui.set_loading(0)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from synthesizer.hparams import hparams as _syn_hp
|
||||
from synthesizer.hparams import syn_hparams as _syn_hp
|
||||
|
||||
|
||||
# Audio settings------------------------------------------------------------------------
|
||||
|
||||
@@ -2,7 +2,7 @@ import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from synthesizer.hparams import hparams
|
||||
from synthesizer.hparams import syn_hparams
|
||||
from synthesizer.synthesize import run_synthesis
|
||||
from utils.argutils import print_args
|
||||
|
||||
@@ -33,7 +33,7 @@ if __name__ == "__main__":
|
||||
"If True, processing is done on CPU, even when a GPU is available.")
|
||||
args = parser.parse_args()
|
||||
print_args(args, parser)
|
||||
modified_hp = hparams.parse(args.hparams)
|
||||
modified_hp = syn_hparams.parse(args.hparams)
|
||||
|
||||
if not hasattr(args, "in_dir"):
|
||||
args.in_dir = args.datasets_root / "SV2TTS" / "synthesizer"
|
||||
|
||||
Reference in New Issue
Block a user