Denoise vocoder output using the Facebook Research (fb) denoiser

2026-05-18 13:15:06 +02:00 · 2023-06-09 17:49:19 +08:00
parent ba7b119073
commit 11a4045b06
6 changed files with 20 additions and 15 deletions
--- a/demo_cli.py
+++ b/demo_cli.py
@@ -236,7 +236,7 @@ if __name__ == '__main__':
        start_syn = time.time()
        # Generating the spectrogram
        # text = input("Write a sentence to be synthesized:\n")
-        text = "The North Wind and the Sun were disputing which was the stronger, when a traveler came along wrapped in a warm cloak. They agreed that the one who first succeeded in making the traveler take his cloak off should be considered stronger than the other. Then the North Wind blew as hard as he could, but the more he blew the more closely did the traveler fold his cloak around him; and at last the North Wind gave up the attempt. Then the Sun shined out warmly, and immediately the traveler took off his cloak.And so the North Wind was obliged to confess that the Sun was the stronger of the two."
+        text = "Mechanics is an essential branch of physics that provides a framework for understanding the behavior of physical bodies under the influence of various forces. The principles of mechanics are based on the laws of motion, which form the foundation of the field. Mechanics has many practical applications in engineering and technology, from aerospace and automotive engineering to robotics and manufacturing. As science and technology continue to evolve, the principles of mechanics will remain an important part of our understanding of the physical world."

        # If seed is specified, reset torch seed and force synthesizer reload
        if args.seed is not None:
@@ -307,7 +307,7 @@ if __name__ == '__main__':
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-        generated_wav = encoder.inference.preprocess_wav(wav)
+        # generated_wav = encoder.inference.preprocess_wav(wav)
        wav = wav / np.abs(wav).max() * 4

        # Save it on the disk
--- a/encoder/params_data.py
+++ b/encoder/params_data.py
@@ -30,5 +30,5 @@ audio_norm_target_dBFS = -30
 # 判断用户输入语音为男声或女声的分界频率
 split_freq = 170  
 # embed去噪置零的阈值
-set_zero_thres=0.06
+set_zero_thres=0.04

--- a/requirements.txt
+++ b/requirements.txt
--- a/synthesizer/models/tacotron.py
+++ b/synthesizer/models/tacotron.py
@@ -458,7 +458,7 @@ class Tacotron(nn.Module):
            if t == 0:
                first_stop_token = stop_tokens[0]      
            # Stop the loop when all stop tokens in batch exceed threshold compared with the 1st token and the sequence's length exceeds threshold
-            if (stop_tokens > first_stop_token * 2e3).all() and t > (20 * self.r): break
+            if (stop_tokens > first_stop_token * 1e4).all() and t > (20 * self.r): break
            # if (stop_tokens > 0.5).all() and t > (20 * self.r): break
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
--- a/vocoder/inference.py
+++ b/vocoder/inference.py
@@ -1,8 +1,12 @@
 from vocoder.models.fatchord_version import WaveRNN
 from vocoder import hparams as hp
 from scipy.fft import rfft, rfftfreq
-import torch
+from scipy import signal
+from denoiser.pretrained import master64
+import librosa
 import numpy as np
+import torch
+import torchaudio
 import noisereduce as nr    


@@ -68,15 +72,15 @@ def infer_waveform(mel, normalize=True,  batched=True, target=8000, overlap=800,
    return wav

 def waveform_denoising(wav):
-    fft_max_freq = get_dominant_freq(wav)
    prop_decrease = hp.prop_decrease_low_freq if hp.sex else hp.prop_decrease_high_freq
-    # prop_decrease = 0.6 for low freq audio
-    # prop_decrease = 0.9 for high freq audio
-    print(f"\nthe dominant frequency of output audio is {fft_max_freq}Hz")
-
-    wav = nr.reduce_noise(wav, hp.sample_rate, prop_decrease=prop_decrease)
-
-    return wav
+    if torch.cuda.is_available():
+        _device = torch.device('cuda')
+    else:
+        _device = torch.device('cpu')
+    model = master64().to(_device)
+    noisy=torch.from_numpy(np.array([wav])).to(_device).float()
+    estimate = model(noisy)[0].cpu().detach().numpy()
+    return  nr.reduce_noise(np.squeeze(estimate), hp.sample_rate, prop_decrease=prop_decrease) 

 def get_dominant_freq(wav, name="fft"):
    import matplotlib.pyplot as plt
--- a/vocoder/models/fatchord_version.py
+++ b/vocoder/models/fatchord_version.py
@@ -250,9 +250,10 @@ class WaveRNN(nn.Module):
            output = de_emphasis(output)

        # Fade-out at the end to avoid signal cutting out suddenly
-        fade_out = np.linspace(1, 0, 20 * self.hop_length)
+        fade_out_len = min(wave_len, 20 * self.hop_length)
+        fade_out = np.linspace(1, 0, fade_out_len)
        output = output[:wave_len]
-        output[-20 * self.hop_length:] *= fade_out
+        output[-fade_out_len:] *= fade_out
        
        self.train()