new commits

2025-12-23 23:20:15 +01:00 · 2023-06-19 19:44:56 +08:00
parent f3d34866a8
commit 000c3ad71f
4 changed files with 14 additions and 10 deletions
--- a/demo_cli.py
+++ b/demo_cli.py
@@ -311,7 +311,7 @@ if __name__ == '__main__':
        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        if not args.griffin_lim:
-            wav = vocoder.infer_waveform(spec, target=4000, overlap=400)
+            wav = vocoder.infer_waveform(spec, target=vocoder.hp.voc_target, overlap=vocoder.hp.voc_overlap, crossfade=vocoder.hp.is_crossfade) 
        else:
            wav = Synthesizer.griffin_lim(spec)

--- a/vocoder/hparams.py
+++ b/vocoder/hparams.py
@@ -40,8 +40,9 @@ voc_seq_len = hop_length * 5        # must be a multiple of hop_length

 # Generating / Synthesizing
 voc_gen_batched = True              # very fast (realtime+) single utterance batched generation
-voc_target = 8000                   # target number of samples to be generated in each batch entry
+voc_target = 4000                   # target number of samples to be generated in each batch entry
 voc_overlap = 400                   # number of samples for crossfading between batches
+is_crossfade = True                 # apply the equal-power crossfade between batches; False uses unit (no-op) fade gains

 # Output Noise Reduce
 prop_decrease_low_freq = 0.6        # prop decrease for low dominant frequency
--- a/vocoder/inference.py
+++ b/vocoder/inference.py
@@ -50,7 +50,7 @@ def is_loaded():


 def infer_waveform(mel, normalize=True,  batched=True, target=8000, overlap=800, 
-                   progress_callback=None):
+                   progress_callback=None, crossfade=True):
    """
    Infers the waveform of a mel spectrogram output by the synthesizer (the format must match 
    that of the synthesizer!)
@@ -67,7 +67,7 @@ def infer_waveform(mel, normalize=True,  batched=True, target=8000, overlap=800,
    if normalize:
        mel = mel / hp.mel_max_abs_value
    mel = torch.from_numpy(mel[None, ...])
-    wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback)
+    wav = _model.generate(mel, batched, target, overlap, hp.mu_law, progress_callback, crossfade=crossfade)
    wav = waveform_denoising(wav)
    return wav

--- a/vocoder/models/fatchord_version.py
+++ b/vocoder/models/fatchord_version.py
@@ -150,7 +150,7 @@ class WaveRNN(nn.Module):
        x = F.relu(self.fc2(x))
        return self.fc3(x)

-    def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None):
+    def generate(self, mels, batched, target, overlap, mu_law, progress_callback=None,crossfade=True):
        mu_law = mu_law if self.mode == 'RAW' else False
        progress_callback = progress_callback or self.gen_display

@@ -240,7 +240,7 @@ class WaveRNN(nn.Module):
        output = output.astype(np.float64)
        
        if batched:
-            output = self.xfade_and_unfold(output, target, overlap)
+            output = self.xfade_and_unfold(output, target, overlap, crossfade=crossfade)
        else:
            output = output[0]

@@ -340,7 +340,7 @@ class WaveRNN(nn.Module):

        return folded

-    def xfade_and_unfold(self, y, target, overlap):
+    def xfade_and_unfold(self, y, target, overlap, crossfade=True):

        ''' Applies a crossfade and unfolds into a 1d array.

@@ -382,9 +382,12 @@ class WaveRNN(nn.Module):
        silence = np.zeros((silence_len), dtype=np.float64)

        # Equal power crossfade
-        t = np.linspace(-1, 1, fade_len, dtype=np.float64)
-        fade_in = np.sqrt(0.5 * (1 + t))
-        fade_out = np.sqrt(0.5 * (1 - t))
+        if crossfade:
+            t = np.linspace(-1, 1, fade_len, dtype=np.float64)
+            fade_in = np.sqrt(0.5 * (1 + t))
+            fade_out = np.sqrt(0.5 * (1 - t))
+        else:
+            fade_in = fade_out = np.ones((fade_len), dtype=np.float64)

        # Concat the silence to the fades
        fade_in = np.concatenate([silence, fade_in])