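Rename Synthesizer to Synthesizer_infer and hparams to syn_hparams; split toolbox input text into sentences with spaCy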

This commit is contained in:
liuhaozhe6788
2023-06-25 15:44:05 +08:00
parent 06e68fd66f
commit be98fed4be
11 changed files with 102 additions and 87 deletions

View File

@@ -44,7 +44,7 @@ if __name__ == '__main__':
import encoder.inference
import encoder.params_data
from synthesizer.inference import Synthesizer
from synthesizer.inference import Synthesizer_infer
from synthesizer.utils.cleaners import add_breaks, english_cleaners_predict
from vocoder import inference as vocoder
from vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
@@ -73,7 +73,7 @@ if __name__ == '__main__':
print("Preparing the encoder and the synthesizer...")
ensure_default_models(args.run_id, Path("saved_models"))
encoder.inference.load_model(list(args.models_dir.glob(f"{args.run_id}/encoder.pt"))[0])
synthesizer = Synthesizer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
synthesizer = Synthesizer_infer(list(args.models_dir.glob(f"{args.run_id}/synthesizer.pt"))[0])
if not args.griffin_lim:
vocoder.load_model(list(args.models_dir.glob(f"{args.run_id}/vocoder.pt"))[0])
@@ -201,7 +201,7 @@ if __name__ == '__main__':
if os.path.exists(standard_fpath):
standard_wav = Synthesizer.load_preprocess_wav(standard_fpath)
standard_wav = Synthesizer_infer.load_preprocess_wav(standard_fpath)
preprocessed_standard_wav = encoder.inference.preprocess_wav(standard_wav)
print("Loaded standard audio file successfully")
@@ -223,7 +223,7 @@ if __name__ == '__main__':
# If seed is specified, reset torch seed and force synthesizer reload
if args.seed is not None:
torch.manual_seed(args.seed)
synthesizer = Synthesizer(args.syn_model_fpath)
synthesizer = Synthesizer_infer(args.syn_model_fpath)
# The synthesizer works in batch, so you need to put your data in a list or numpy array
def preprocess_text(text):
@@ -267,17 +267,17 @@ if __name__ == '__main__':
if not args.griffin_lim:
wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade)
else:
wav = Synthesizer.griffin_lim(spec)
wav = Synthesizer_infer.griffin_lim(spec)
end_voc = time.time()
print(f"Prediction time of vocoder is {end_voc - start_voc}s")
print(f"Prediction time of TTS is {end_voc - start_syn}s")
# Add breaks
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
# Trim excess silences to compensate for gaps in spectrograms (issue #53)

View File

@@ -1,5 +1,6 @@
import argparse
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from pathlib import Path
from toolbox import Toolbox
@@ -12,7 +13,7 @@ if __name__ == '__main__':
description="Runs the toolbox.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--run_id", type=str, default="default", help= \
parser.add_argument("--run_id", type=str, default="20230609", help= \
"Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
"from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved "
"states and restart from scratch.")

View File

@@ -3,8 +3,8 @@ from ffmpeg import audio
from pathlib import Path
import numpy as np
import parselmouth
from synthesizer.inference import Synthesizer
from synthesizer.hparams import hparams
from synthesizer.inference import Synthesizer_infer
from synthesizer.hparams import syn_hparams
import soundfile as sf
from parselmouth.praat import run_file
@@ -67,13 +67,13 @@ def TransFormat(fullpath, out_suffix):
is_wav_file = False # 原始音频的后缀是否为.wav
path_, name = os.path.split(fullpath)
name, suffix = os.path.splitext(name)
wav = Synthesizer.load_preprocess_wav(fullpath)
wav = Synthesizer_infer.load_preprocess_wav(fullpath)
if suffix == ".wav": # 如果原始音频的后缀为.wav则不用进行格式转换
is_wav_file = True
return is_wav_file, wav, str(fullpath)
else: # 如果原始音频的后缀不是.wav则需要进行格式转换
out_file = os.path.join(path_, name + "." + str(out_suffix))
sf.write(out_file, wav.astype(np.float32), hparams.sample_rate)
sf.write(out_file, wav.astype(np.float32), syn_hparams.sample_rate)
return is_wav_file, wav, str(out_file)

View File

@@ -18,7 +18,7 @@ class HParams(object):
self.__dict__[k] = ast.literal_eval(values[keys.index(k)])
return self
hparams = HParams(
syn_hparams = HParams(
### Signal Processing (used in both synthesizer and vocoder)
sample_rate = 16000,
n_fft = 800,
@@ -89,4 +89,4 @@ hparams = HParams(
)
def hparams_debug_string():
return str(hparams)
return str(syn_hparams)

View File

@@ -1,6 +1,6 @@
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.hparams import syn_hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
@@ -11,9 +11,9 @@ import numpy as np
import librosa
class Synthesizer:
sample_rate = hparams.sample_rate
hparams = hparams
class Synthesizer_infer:
sample_rate = syn_hparams.sample_rate
hparams = syn_hparams
def __init__(self, model_fpath: Path, verbose=True):
"""
@@ -46,20 +46,20 @@ class Synthesizer:
"""
Instantiates and loads the model given the weights file that was passed in the constructor.
"""
self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
self._model = Tacotron(embed_dims=syn_hparams.tts_embed_dims,
num_chars=len(symbols),
encoder_dims=hparams.tts_encoder_dims,
decoder_dims=hparams.tts_decoder_dims,
n_mels=hparams.num_mels,
fft_bins=hparams.num_mels,
postnet_dims=hparams.tts_postnet_dims,
encoder_K=hparams.tts_encoder_K,
lstm_dims=hparams.tts_lstm_dims,
postnet_K=hparams.tts_postnet_K,
num_highways=hparams.tts_num_highways,
dropout=hparams.tts_dropout,
stop_threshold=hparams.tts_stop_threshold,
speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)
encoder_dims=syn_hparams.tts_encoder_dims,
decoder_dims=syn_hparams.tts_decoder_dims,
n_mels=syn_hparams.num_mels,
fft_bins=syn_hparams.num_mels,
postnet_dims=syn_hparams.tts_postnet_dims,
encoder_K=syn_hparams.tts_encoder_K,
lstm_dims=syn_hparams.tts_lstm_dims,
postnet_K=syn_hparams.tts_postnet_K,
num_highways=syn_hparams.tts_num_highways,
dropout=syn_hparams.tts_dropout,
stop_threshold=syn_hparams.tts_stop_threshold,
speaker_embedding_size=syn_hparams.speaker_embedding_size).to(self.device)
self._model.load(self.model_fpath)
self._model.eval()
@@ -91,10 +91,10 @@ class Synthesizer:
embeddings = [embeddings]
# Batch inputs
batched_inputs = [inputs[i:i+hparams.synthesis_batch_size]
for i in range(0, len(inputs), hparams.synthesis_batch_size)]
batched_embeds = [embeddings[i:i+hparams.synthesis_batch_size]
for i in range(0, len(embeddings), hparams.synthesis_batch_size)]
batched_inputs = [inputs[i:i+syn_hparams.synthesis_batch_size]
for i in range(0, len(inputs), syn_hparams.synthesis_batch_size)]
batched_embeds = [embeddings[i:i+syn_hparams.synthesis_batch_size]
for i in range(0, len(embeddings), syn_hparams.synthesis_batch_size)]
specs = []
for i, batch in enumerate(batched_inputs, 1):
@@ -121,12 +121,12 @@ class Synthesizer:
stop_tokens = stop_tokens.detach().cpu().numpy()
for m in mels:
# Trim silence from end of each spectrogram
while np.max(m[:, -1]) < hparams.tts_stop_threshold:
while np.max(m[:, -1]) < syn_hparams.tts_stop_threshold:
if m.shape[-1] == 1:
break
m = m[:, :-1]
# Trim silence from start of each spectrogram
while np.max(m[:, 0]) < hparams.tts_start_threshold:
while np.max(m[:, 0]) < syn_hparams.tts_start_threshold:
if m.shape[-1] == 1:
break
m = m[:, 1:]
@@ -142,9 +142,9 @@ class Synthesizer:
Loads and preprocesses an audio file under the same conditions the audio files were used to
train the synthesizer.
"""
wav = librosa.load(str(fpath), hparams.sample_rate)[0]
if hparams.rescale:
wav = wav / np.abs(wav).max() * hparams.rescaling_max
wav = librosa.load(str(fpath), syn_hparams.sample_rate)[0]
if syn_hparams.rescale:
wav = wav / np.abs(wav).max() * syn_hparams.rescaling_max
return wav
@staticmethod
@@ -154,11 +154,11 @@ class Synthesizer:
were fed to the synthesizer when training.
"""
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
wav = Synthesizer_infer.load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
mel_spectrogram = audio.melspectrogram(wav, syn_hparams).astype(np.float32)
return mel_spectrogram
@staticmethod
@@ -167,7 +167,7 @@ class Synthesizer:
Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
with the same parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)
return audio.inv_mel_spectrogram(mel, syn_hparams)
def pad1d(x, max_len, pad_value=0):

View File

@@ -2,7 +2,7 @@ from multiprocessing.pool import Pool
from synthesizer import audio
from functools import partial
from itertools import chain, groupby
from encoder import inference as encoder
from encoder import inference as encoder_infer
from pathlib import Path
from utils import logmmse
from tqdm import tqdm
@@ -330,7 +330,7 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
return None
# Trim silence
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
wav = encoder_infer.preprocess_wav(wav, normalize=False, trim_silence=trim_silence)
# Skip utterances that are too short
if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
@@ -353,14 +353,14 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
def embed_utterance(fpaths, encoder_model_fpath):
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
if not encoder_infer.is_loaded():
encoder_infer.load_model(encoder_model_fpath)
# Compute the speaker embedding of the utterance
wav_fpath, embed_fpath = fpaths
wav = np.load(wav_fpath)
wav = encoder.preprocess_wav(wav)
embed = encoder.embed_utterance(wav)
wav = encoder_infer.preprocess_wav(wav)
embed = encoder_infer.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)

View File

@@ -1,5 +1,5 @@
from synthesizer.preprocess import preprocess_librispeech, preprocess_vctk
from synthesizer.hparams import hparams
from synthesizer.hparams import syn_hparams
from utils.argutils import print_args
from pathlib import Path
import argparse
@@ -40,7 +40,7 @@ if __name__ == "__main__":
# Preprocess the dataset
print_args(args, parser)
args.hparams = hparams.parse(args.hparams)
args.hparams = syn_hparams.parse(args.hparams)
preprocess_func = {
"LibriSpeech": preprocess_librispeech,
"VCTK": preprocess_vctk,

View File

@@ -1,6 +1,6 @@
from pathlib import Path
from synthesizer.hparams import hparams
from synthesizer.hparams import syn_hparams
from synthesizer.train import train
from utils.argutils import print_args
import argparse
@@ -32,7 +32,7 @@ if __name__ == "__main__":
args = parser.parse_args()
print_args(args, parser)
args.hparams = hparams.parse(args.hparams)
args.hparams = syn_hparams.parse(args.hparams)
# Run the training
train(**vars(args))

View File

@@ -7,10 +7,13 @@ import re
import numpy as np
import torch
import soundfile as sf
import spacy
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from synthesizer.hparams import hparams
import encoder
from encoder import inference as encoder_infer
from synthesizer.inference import Synthesizer_infer
from synthesizer.utils.cleaners import add_breaks, english_cleaners_predict
from synthesizer.hparams import syn_hparams
from toolbox.ui import UI
from toolbox.utterance import Utterance
from vocoder import inference as vocoder
@@ -52,12 +55,13 @@ class Toolbox:
self.utterances = set()
self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav
self.synthesizer = None # type: Synthesizer
self.synthesizer = None # type: Synthesizer_infer
self.current_wav = None
self.waves_list = []
self.waves_count = 0
self.waves_namelist = []
self.start_generate_time = None
self.nlp = spacy.load('en_core_web_sm')
# Check for webrtcvad (enables removal of silences in vocoder output)
try:
@@ -100,13 +104,13 @@ class Toolbox:
self.ui.browser_browse_button.clicked.connect(func)
func = lambda: self.ui.draw_utterance(self.ui.selected_utterance, "current")
self.ui.utterance_history.currentIndexChanged.connect(func)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer.sample_rate)
func = lambda: self.ui.play(self.ui.selected_utterance.wav, Synthesizer_infer.sample_rate)
self.ui.play_button.clicked.connect(func)
self.ui.stop_button.clicked.connect(self.ui.stop)
self.ui.record_button.clicked.connect(self.record)
#Audio
self.ui.setup_audio_devices(Synthesizer.sample_rate)
self.ui.setup_audio_devices(Synthesizer_infer.sample_rate)
#Wav playback & save
func = lambda: self.replay_last_wav()
@@ -129,10 +133,10 @@ class Toolbox:
self.current_wav = self.waves_list[index]
def export_current_wave(self):
self.ui.save_audio_file(self.current_wav, Synthesizer.sample_rate)
self.ui.save_audio_file(self.current_wav, Synthesizer_infer.sample_rate)
def replay_last_wav(self):
self.ui.play(self.current_wav, Synthesizer.sample_rate)
self.ui.play(self.current_wav, Synthesizer_infer.sample_rate)
def reset_ui(self, run_id: str, models_dir: Path, seed: int=None):
self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True)
@@ -159,16 +163,16 @@ class Toolbox:
# Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
# playback, so as to have a fair comparison with the generated audio
wav = Synthesizer.load_preprocess_wav(fpath)
wav = Synthesizer_infer.load_preprocess_wav(fpath)
self.ui.log("Loaded %s" % name)
self.add_real_utterance(wav, name, speaker_name)
def record(self):
wav = self.ui.record_one(encoder.sampling_rate, 5)
wav = self.ui.record_one(encoder_infer.sampling_rate, 5)
if wav is None:
return
self.ui.play(wav, encoder.sampling_rate)
self.ui.play(wav, encoder_infer.sampling_rate)
speaker_name = "user01"
name = speaker_name + "_rec_%05d" % np.random.randint(100000)
@@ -176,14 +180,15 @@ class Toolbox:
def add_real_utterance(self, wav, name, speaker_name):
# Compute the mel spectrogram
spec = Synthesizer.make_spectrogram(wav)
spec = Synthesizer_infer.make_spectrogram(wav)
self.ui.draw_spec(spec, "current")
# Compute the embedding
if not encoder.is_loaded():
if not encoder_infer.is_loaded():
self.init_encoder()
encoder_wav = encoder.preprocess_wav(wav)
embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
encoder_wav = encoder_infer.preprocess_wav(wav)
embed, partial_embeds, _ = encoder_infer.embed_utterance(encoder_wav, return_partials=True)
embed[embed < encoder.params_data.set_zero_thres]=0 # 噪声值置零
# Add the utterance
utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, False)
@@ -217,11 +222,20 @@ class Toolbox:
if self.synthesizer is None or seed is not None:
self.init_synthesizer()
texts = re.split("\.|,|!|\?|;|:|\n", self.ui.text_prompt.toPlainText())
texts = list(filter(None, texts))
embed = self.ui.selected_utterance.embed
def preprocess_text(text):
    """Turn the raw prompt text into a list of sentence strings.

    The text is passed through ``add_breaks`` and then
    ``english_cleaners_predict``. The cleaned paragraph is run through the
    spaCy pipeline stored on ``self.nlp``, and the text of every sentence
    span is returned with surrounding whitespace stripped.
    """
    cleaned = english_cleaners_predict(add_breaks(text))
    # One stripped string per sentence span reported by the pipeline.
    return [sentence.text.strip() for sentence in self.nlp(cleaned).sents]
texts = preprocess_text(self.ui.text_prompt.toPlainText())
print(f"the list of inputs texts:\n{texts}")
embeds = [embed] * len(texts)
specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
specs, alignments, stop_tokens = self.synthesizer.synthesize_spectrograms(texts, embeds, require_visualization=True)
breaks = [spec.shape[1] for spec in specs]
spec = np.concatenate(specs, axis=1)
@@ -248,35 +262,35 @@ class Toolbox:
self.init_vocoder()
def vocoder_progress(i, seq_len, b_size, gen_rate):
real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
real_time_factor = (gen_rate / Synthesizer_infer.sample_rate) * 1000
line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
% (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
self.ui.log(line, "overwrite")
self.ui.set_loading(i, seq_len)
if self.ui.current_vocoder_fpath is not None and not self.ui.griffin_lim_checkbox.isChecked():
self.ui.log("")
wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade)
else:
self.ui.log("Waveform generation with Griffin-Lim... ")
wav = Synthesizer.griffin_lim(spec)
wav = Synthesizer_infer.griffin_lim(spec)
self.ui.set_loading(0)
self.ui.log(" Done!", "append")
self.ui.log(f"Generate time: {time.time() - self.start_generate_time}s")
# Add breaks
b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
b_starts = np.concatenate(([0], b_ends[:-1]))
wavs = [wav[start:end] for start, end, in zip(b_starts, b_ends)]
breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
# Trim excessive silences
if self.ui.trim_silences_checkbox.isChecked():
wav = encoder.preprocess_wav(wav)
wav = encoder_infer.preprocess_wav(wav)
# Play it
wav = wav / np.abs(wav).max() * 0.5
self.ui.play(wav, Synthesizer.sample_rate)
wav = wav / np.abs(wav).max() * 4
self.ui.play(wav, Synthesizer_infer.sample_rate)
# Name it (history displayed in combobox)
# TODO better naming for the combobox items?
@@ -304,10 +318,10 @@ class Toolbox:
# Compute the embedding
# TODO: this is problematic with different sampling rates, gotta fix it
if not encoder.is_loaded():
if not encoder_infer.is_loaded():
self.init_encoder()
encoder_wav = encoder.preprocess_wav(wav)
embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
encoder_wav = encoder_infer.preprocess_wav(wav)
embed, partial_embeds, _ = encoder_infer.embed_utterance(encoder_wav, return_partials=True)
# Add the utterance
name = speaker_name + "_gen_%05d" % np.random.randint(100000)
@@ -324,7 +338,7 @@ class Toolbox:
self.ui.log("Loading the encoder %s... " % model_fpath)
self.ui.set_loading(1)
start = timer()
encoder.load_model(model_fpath)
encoder_infer.load_model(model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)
@@ -334,7 +348,7 @@ class Toolbox:
self.ui.log("Loading the synthesizer %s... " % model_fpath)
self.ui.set_loading(1)
start = timer()
self.synthesizer = Synthesizer(model_fpath)
self.synthesizer = Synthesizer_infer(model_fpath)
self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
self.ui.set_loading(0)

View File

@@ -1,4 +1,4 @@
from synthesizer.hparams import hparams as _syn_hp
from synthesizer.hparams import syn_hparams as _syn_hp
# Audio settings------------------------------------------------------------------------

View File

@@ -2,7 +2,7 @@ import argparse
import os
from pathlib import Path
from synthesizer.hparams import hparams
from synthesizer.hparams import syn_hparams
from synthesizer.synthesize import run_synthesis
from utils.argutils import print_args
@@ -33,7 +33,7 @@ if __name__ == "__main__":
"If True, processing is done on CPU, even when a GPU is available.")
args = parser.parse_args()
print_args(args, parser)
modified_hp = hparams.parse(args.hparams)
modified_hp = syn_hparams.parse(args.hparams)
if not hasattr(args, "in_dir"):
args.in_dir = args.datasets_root / "SV2TTS" / "synthesizer"