From 2c4bbbf9b9a25dc952eeebed9ebfced86429c992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 6 Sep 2021 14:29:22 +0000 Subject: [PATCH] Use pyworld for pitch --- TTS/utils/audio.py | 55 ++------ TTS/utils/yin.py | 118 ------------------ TTS/vocoder/models/__init__.py | 2 +- .../ljspeech/fast_pitch/train_fast_pitch.py | 1 + requirements.txt | 1 + tests/__init__.py | 5 +- 6 files changed, 19 insertions(+), 163 deletions(-) delete mode 100644 TTS/utils/yin.py diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 6a74b3c8..01d1f7d1 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -2,6 +2,7 @@ from typing import Dict, Tuple import librosa import numpy as np +import pyworld as pw import scipy.io.wavfile import scipy.signal import soundfile as sf @@ -9,7 +10,6 @@ import torch from torch import nn from TTS.tts.utils.data import StandardScaler -from TTS.utils.yin import compute_yin class TorchSTFT(nn.Module): # pylint: disable=abstract-method @@ -640,59 +640,28 @@ class AudioProcessor(object): >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> pitch = ap.compute_f0(wav) """ - # f0, t = pw.dio( - # x.astype(np.double), - # fs=self.sample_rate, - # f0_ceil=self.mel_fmax, - # frame_period=1000 * self.hop_length / self.sample_rate, - # ) - # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) - # f0, _, _, _ = compute_yin( - # x, - # self.sample_rate, - # self.win_length, - # self.hop_length, - # 65 if self.mel_fmin == 0 else self.mel_fmin, - # self.mel_fmax, - # ) - # # import pyworld as pw - # # f0, _ = pw.dio(x.astype(np.float64), self.sample_rate, - # # frame_period=self.hop_length / self.sample_rate * 1000) + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) # pad = int((self.win_length / self.hop_length) / 2) # f0 = [0.0] * pad + f0 + [0.0] * pad + # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0) # f0 = np.array(f0, dtype=np.float32) - f0, _, _ = librosa.pyin( - x, - fmin=65 if self.mel_fmin == 0 else self.mel_fmin, - fmax=self.mel_fmax, - frame_length=self.win_length, - sr=self.sample_rate, - fill_na=0.0, - ) - - # f02 = librosa.yin( + # f01, _, _ = librosa.pyin( # x, # fmin=65 if self.mel_fmin == 0 else self.mel_fmin, # fmax=self.mel_fmax, # frame_length=self.win_length, - # sr=self.sample_rate + # sr=self.sample_rate, + # fill_na=0.0, # ) # spec = self.melspectrogram(x) - - # from matplotlib import pyplot as plt - # plt.figure() - # plt.plot(f0, linewidth=2.5, color='red') - # plt.plot(f01, linewidth=2.5, linestyle='-.') - # plt.plot(f02, linewidth=2.5) - # plt.xlabel('time') - # plt.ylabel('F0') - # plt.savefig('save_img.png') - - # # plt.figure() - # plt.imshow(spec, aspect="auto", origin="lower") - # plt.savefig('save_img2.png') return f0 ### Audio Processing ### diff --git a/TTS/utils/yin.py b/TTS/utils/yin.py deleted file mode 100644 index 3d8bf64b..00000000 --- a/TTS/utils/yin.py +++ /dev/null @@ -1,118 +0,0 @@ -# adapted from https://github.com/patriceguyot/Yin - -import numpy as np - - -def differenceFunction(x, N, tau_max): - """ - Compute difference function of data x. This corresponds to equation (6) in [1] - This solution is implemented directly with Numpy fft. 
- - - :param x: audio data - :param N: length of data - :param tau_max: integration window size - :return: difference function - :rtype: list - """ - - x = np.array(x, np.float64) - w = x.size - tau_max = min(tau_max, w) - x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum())) - size = w + tau_max - p2 = (size // 32).bit_length() - nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32) - size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size) - fc = np.fft.rfft(x, size_pad) - conv = np.fft.irfft(fc * fc.conjugate())[:tau_max] - return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv - - -def cumulativeMeanNormalizedDifferenceFunction(df, N): - """ - Compute cumulative mean normalized difference function (CMND). - - This corresponds to equation (8) in [1] - - :param df: Difference function - :param N: length of data - :return: cumulative mean normalized difference function - :rtype: list - """ - - cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) # scipy method - return np.insert(cmndf, 0, 1) - - -def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1): - """ - Return fundamental period of a frame based on CMND function. - - :param cmdf: Cumulative Mean Normalized Difference function - :param tau_min: minimum period for speech - :param tau_max: maximum period for speech - :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency - :return: fundamental period if there is values under threshold, 0 otherwise - :rtype: float - """ - tau = tau_min - while tau < tau_max: - if cmdf[tau] < harmo_th: - while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]: - tau += 1 - return tau - tau += 1 - - return 0 # if unvoiced - - -def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1): - """ - - Compute the Yin Algorithm. Return fundamental frequency and harmonic rate. - - :param sig: Audio signal (list of float) - :param sr: sampling rate (int) - :param w_len: size of the analysis window (samples) - :param w_step: size of the lag between two consecutives windows (samples) - :param f0_min: Minimum fundamental frequency that can be detected (hertz) - :param f0_max: Maximum fundamental frequency that can be detected (hertz) - :param harmo_tresh: Threshold of detection. The yalgorithmù return the first minimum of the CMND function below this treshold. 
- - :returns: - - * pitches: list of fundamental frequencies, - * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value) - * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction - * times: list of time of each estimation - :rtype: tuple - """ - - tau_min = int(sr / f0_max) - tau_max = int(sr / f0_min) - - timeScale = range(0, len(sig) - w_len, w_step) # time values for each analysis window - times = [t / float(sr) for t in timeScale] - frames = [sig[t : t + w_len] for t in timeScale] - - pitches = [0.0] * len(timeScale) - harmonic_rates = [0.0] * len(timeScale) - argmins = [0.0] * len(timeScale) - - for i, frame in enumerate(frames): - # Compute YIN - df = differenceFunction(frame, w_len, tau_max) - cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max) - p = getPitch(cmdf, tau_min, tau_max, harmo_thresh) - - # Get results - if np.argmin(cmdf) > tau_min: - argmins[i] = float(sr / np.argmin(cmdf)) - if p != 0: # A pitch was found - pitches[i] = float(sr / p) - harmonic_rates[i] = cmdf[p] - else: # No pitch, but we compute a value of the harmonic rate - harmonic_rates[i] = min(cmdf) - - return pitches, harmonic_rates, argmins, times diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index edc94d72..a70ebe40 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -11,7 +11,6 @@ def to_camel(text): def setup_model(config: Coqpit): """Load models directly from configuration.""" - print(" > Vocoder Model: {}".format(config.model)) if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") MyModel = getattr(MyModel, "GAN") @@ -28,6 +27,7 @@ def setup_model(config: Coqpit): MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: raise ValueError(f"Model {config.model} not exist!") from e + print(" > Vocoder Model: {}".format(config.model)) model = MyModel(config) return model diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 5c9e67da..614e42e0 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -43,6 +43,7 @@ config = FastPitchConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, + use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, diff --git a/requirements.txt b/requirements.txt index b92947a0..a87a3c6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,4 @@ unidic-lite==1.0.8 # gruut+supported langs gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 fsspec>=2021.04.0 +pyworld \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py index a7878132..2b07004f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,7 +7,10 @@ from TTS.utils.generic_utils import get_cuda def get_device_id(): use_cuda, _ = get_cuda() if use_cuda: - GPU_ID = "0" + if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "": + GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0] + else: + GPU_ID = "0" else: GPU_ID = "" return GPU_ID
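
Note (not part of the patch): the sketch below reproduces, in isolation, the pyworld-based pitch path that compute_f0 switches to above. The wav path and the sample_rate / hop_length / f0_ceil values are illustrative stand-ins for the corresponding AudioProcessor attributes (they mirror a typical LJSpeech setup), not values fixed by this change.

    # Standalone sketch of the dio + stonemask pitch extraction used in the new compute_f0.
    import librosa
    import numpy as np
    import pyworld as pw

    sample_rate = 22050          # placeholder for self.sample_rate
    hop_length = 256             # placeholder for self.hop_length
    f0_ceil = 8000.0             # the patch passes self.mel_fmax here

    wav, _ = librosa.load("example.wav", sr=sample_rate)  # "example.wav" is a placeholder path
    x = wav.astype(np.double)    # pyworld expects a float64 signal

    # Coarse F0 track; frame_period (in ms) is chosen so pitch frames
    # line up with mel-spectrogram frames (one estimate per hop).
    f0, t = pw.dio(
        x,
        fs=sample_rate,
        f0_ceil=f0_ceil,
        frame_period=1000 * hop_length / sample_rate,
    )
    # Refine the coarse estimate against the raw waveform.
    f0 = pw.stonemask(x, f0, t, sample_rate)

    print(f0.shape)  # roughly len(x) // hop_length + 1 frames; unvoiced frames are 0.0

Compared with the librosa.pyin call it replaces (left commented out in the hunk above), dio + stonemask runs considerably faster on CPU, which is presumably the motivation for the switch.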