diff --git a/demo_cli.py b/demo_cli.py index 897e492..e1f1736 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -236,7 +236,7 @@ if __name__ == '__main__': start_syn = time.time() # Generating the spectrogram # text = input("Write a sentence to be synthesized:\n") - text = "The North Wind and the Sun were disputing which was the stronger, when a traveler came along wrapped in a warm cloak. They agreed that the one who first succeeded in making the traveler take his cloak off should be considered stronger than the other. Then the North Wind blew as hard as he could, but the more he blew the more closely did the traveler fold his cloak around him; and at last the North Wind gave up the attempt. Then the Sun shined out warmly, and immediately the traveler took off his cloak.And so the North Wind was obliged to confess that the Sun was the stronger of the two." + text = "Mechanics is an essential branch of physics that provides a framework for understanding the behavior of physical bodies under the influence of various forces. The principles of mechanics are based on the laws of motion, which form the foundation of the field. Mechanics has many practical applications in engineering and technology, from aerospace and automotive engineering to robotics and manufacturing. As science and technology continue to evolve, the principles of mechanics will remain an important part of our understanding of the physical world." # If seed is specified, reset torch seed and force synthesizer reload if args.seed is not None: @@ -307,7 +307,7 @@ if __name__ == '__main__': wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)]) # Trim excess silences to compensate for gaps in spectrograms (issue #53) - generated_wav = encoder.inference.preprocess_wav(wav) + # generated_wav = encoder.inference.preprocess_wav(wav) wav = wav / np.abs(wav).max() * 4 # Save it on the disk diff --git a/encoder/params_data.py b/encoder/params_data.py index 105ab9e..ace11b4 100644 --- a/encoder/params_data.py +++ b/encoder/params_data.py @@ -30,5 +30,5 @@ audio_norm_target_dBFS = -30 # 判断用户输入语音为男声或女声的分界频率 split_freq = 170 # embed去噪置零的阈值 -set_zero_thres=0.06 +set_zero_thres=0.04 diff --git a/requirements.txt b/requirements.txt index f85ec72..0bbf461 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/synthesizer/models/tacotron.py b/synthesizer/models/tacotron.py index eedd3b5..999d321 100644 --- a/synthesizer/models/tacotron.py +++ b/synthesizer/models/tacotron.py @@ -458,7 +458,7 @@ class Tacotron(nn.Module): if t == 0: first_stop_token = stop_tokens[0] # Stop the loop when all stop tokens in batch exceed threshold compared with the 1st token and the sequence's length exceeds threshold - if (stop_tokens > first_stop_token * 2e3).all() and t > (20 * self.r): break + if (stop_tokens > first_stop_token * 1e4).all() and t > (20 * self.r): break # if (stop_tokens > 0.5).all() and t > (20 * self.r): break if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/vocoder/inference.py b/vocoder/inference.py index 4fd0114..575df4d 100644 --- a/vocoder/inference.py +++ b/vocoder/inference.py @@ -1,8 +1,12 @@ from vocoder.models.fatchord_version import WaveRNN from vocoder import hparams as hp from scipy.fft import rfft, rfftfreq -import torch +from scipy import signal +from denoiser.pretrained import master64 +import librosa import numpy as np +import torch +import torchaudio import noisereduce as nr @@ -68,15 +72,15 @@ def infer_waveform(mel, normalize=True, batched=True, target=8000, overlap=800, return wav def waveform_denoising(wav): - fft_max_freq = get_dominant_freq(wav) prop_decrease = hp.prop_decrease_low_freq if hp.sex else hp.prop_decrease_high_freq - # prop_decrease = 0.6 for low freq audio - # prop_decrease = 0.9 for high freq audio - print(f"\nthe dominant frequency of output audio is {fft_max_freq}Hz") - - wav = nr.reduce_noise(wav, hp.sample_rate, prop_decrease=prop_decrease) - - return wav + if torch.cuda.is_available(): + _device = torch.device('cuda') + else: + _device = torch.device('cpu') + model = master64().to(_device) + noisy=torch.from_numpy(np.array([wav])).to(_device).float() + estimate = model(noisy)[0].cpu().detach().numpy() + return nr.reduce_noise(np.squeeze(estimate), hp.sample_rate, prop_decrease=prop_decrease) def get_dominant_freq(wav, name="fft"): import matplotlib.pyplot as plt diff --git a/vocoder/models/fatchord_version.py b/vocoder/models/fatchord_version.py index 151b557..8c47aec 100644 --- a/vocoder/models/fatchord_version.py +++ b/vocoder/models/fatchord_version.py @@ -250,9 +250,10 @@ class WaveRNN(nn.Module): output = de_emphasis(output) # Fade-out at the end to avoid signal cutting out suddenly - fade_out = np.linspace(1, 0, 20 * self.hop_length) + fade_out_len = min(wave_len, 20 * self.hop_length) + fade_out = np.linspace(1, 0, fade_out_len) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + output[-fade_out_len:] *= fade_out self.train()