From 2c4bbbf9b9a25dc952eeebed9ebfced86429c992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 6 Sep 2021 14:29:22 +0000 Subject: [PATCH] Use pyworld for pitch --- TTS/utils/audio.py | 55 ++------ TTS/utils/yin.py | 118 ------------------ TTS/vocoder/models/__init__.py | 2 +- .../ljspeech/fast_pitch/train_fast_pitch.py | 1 + requirements.txt | 1 + tests/__init__.py | 5 +- 6 files changed, 19 insertions(+), 163 deletions(-) delete mode 100644 TTS/utils/yin.py diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 6a74b3c8..01d1f7d1 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -2,6 +2,7 @@ from typing import Dict, Tuple import librosa import numpy as np +import pyworld as pw import scipy.io.wavfile import scipy.signal import soundfile as sf @@ -9,7 +10,6 @@ import torch from torch import nn from TTS.tts.utils.data import StandardScaler -from TTS.utils.yin import compute_yin class TorchSTFT(nn.Module): # pylint: disable=abstract-method @@ -640,59 +640,28 @@ class AudioProcessor(object): >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> pitch = ap.compute_f0(wav) """ - # f0, t = pw.dio( - # x.astype(np.double), - # fs=self.sample_rate, - # f0_ceil=self.mel_fmax, - # frame_period=1000 * self.hop_length / self.sample_rate, - # ) - # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) - # f0, _, _, _ = compute_yin( - # x, - # self.sample_rate, - # self.win_length, - # self.hop_length, - # 65 if self.mel_fmin == 0 else self.mel_fmin, - # self.mel_fmax, - # ) - # # import pyworld as pw - # # f0, _ = pw.dio(x.astype(np.float64), self.sample_rate, - # # frame_period=self.hop_length / self.sample_rate * 1000) + f0, t = pw.dio( + x.astype(np.double), + fs=self.sample_rate, + f0_ceil=self.mel_fmax, + frame_period=1000 * self.hop_length / self.sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) # pad = int((self.win_length / self.hop_length) / 2) # f0 = [0.0] * pad + f0 + [0.0] * pad + # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0) # f0 = np.array(f0, dtype=np.float32) - f0, _, _ = librosa.pyin( - x, - fmin=65 if self.mel_fmin == 0 else self.mel_fmin, - fmax=self.mel_fmax, - frame_length=self.win_length, - sr=self.sample_rate, - fill_na=0.0, - ) - - # f02 = librosa.yin( + # f01, _, _ = librosa.pyin( # x, # fmin=65 if self.mel_fmin == 0 else self.mel_fmin, # fmax=self.mel_fmax, # frame_length=self.win_length, - # sr=self.sample_rate + # sr=self.sample_rate, + # fill_na=0.0, # ) # spec = self.melspectrogram(x) - - # from matplotlib import pyplot as plt - # plt.figure() - # plt.plot(f0, linewidth=2.5, color='red') - # plt.plot(f01, linewidth=2.5, linestyle='-.') - # plt.plot(f02, linewidth=2.5) - # plt.xlabel('time') - # plt.ylabel('F0') - # plt.savefig('save_img.png') - - # # plt.figure() - # plt.imshow(spec, aspect="auto", origin="lower") - # plt.savefig('save_img2.png') return f0 ### Audio Processing ### diff --git a/TTS/utils/yin.py b/TTS/utils/yin.py deleted file mode 100644 index 3d8bf64b..00000000 --- a/TTS/utils/yin.py +++ /dev/null @@ -1,118 +0,0 @@ -# adapted from https://github.com/patriceguyot/Yin - -import numpy as np - - -def differenceFunction(x, N, tau_max): - """ - Compute difference function of data x. This corresponds to equation (6) in [1] - This solution is implemented directly with Numpy fft. 
- - - :param x: audio data - :param N: length of data - :param tau_max: integration window size - :return: difference function - :rtype: list - """ - - x = np.array(x, np.float64) - w = x.size - tau_max = min(tau_max, w) - x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum())) - size = w + tau_max - p2 = (size // 32).bit_length() - nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32) - size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size) - fc = np.fft.rfft(x, size_pad) - conv = np.fft.irfft(fc * fc.conjugate())[:tau_max] - return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv - - -def cumulativeMeanNormalizedDifferenceFunction(df, N): - """ - Compute cumulative mean normalized difference function (CMND). - - This corresponds to equation (8) in [1] - - :param df: Difference function - :param N: length of data - :return: cumulative mean normalized difference function - :rtype: list - """ - - cmndf = df[1:] * range(1, N) / np.cumsum(df[1:]).astype(float) # scipy method - return np.insert(cmndf, 0, 1) - - -def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1): - """ - Return fundamental period of a frame based on CMND function. - - :param cmdf: Cumulative Mean Normalized Difference function - :param tau_min: minimum period for speech - :param tau_max: maximum period for speech - :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency - :return: fundamental period if there is values under threshold, 0 otherwise - :rtype: float - """ - tau = tau_min - while tau < tau_max: - if cmdf[tau] < harmo_th: - while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]: - tau += 1 - return tau - tau += 1 - - return 0 # if unvoiced - - -def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1): - """ - - Compute the Yin Algorithm. Return fundamental frequency and harmonic rate. - - :param sig: Audio signal (list of float) - :param sr: sampling rate (int) - :param w_len: size of the analysis window (samples) - :param w_step: size of the lag between two consecutives windows (samples) - :param f0_min: Minimum fundamental frequency that can be detected (hertz) - :param f0_max: Maximum fundamental frequency that can be detected (hertz) - :param harmo_tresh: Threshold of detection. The yalgorithmù return the first minimum of the CMND function below this treshold. 
- - :returns: - - * pitches: list of fundamental frequencies, - * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value) - * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction - * times: list of time of each estimation - :rtype: tuple - """ - - tau_min = int(sr / f0_max) - tau_max = int(sr / f0_min) - - timeScale = range(0, len(sig) - w_len, w_step) # time values for each analysis window - times = [t / float(sr) for t in timeScale] - frames = [sig[t : t + w_len] for t in timeScale] - - pitches = [0.0] * len(timeScale) - harmonic_rates = [0.0] * len(timeScale) - argmins = [0.0] * len(timeScale) - - for i, frame in enumerate(frames): - # Compute YIN - df = differenceFunction(frame, w_len, tau_max) - cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max) - p = getPitch(cmdf, tau_min, tau_max, harmo_thresh) - - # Get results - if np.argmin(cmdf) > tau_min: - argmins[i] = float(sr / np.argmin(cmdf)) - if p != 0: # A pitch was found - pitches[i] = float(sr / p) - harmonic_rates[i] = cmdf[p] - else: # No pitch, but we compute a value of the harmonic rate - harmonic_rates[i] = min(cmdf) - - return pitches, harmonic_rates, argmins, times diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index edc94d72..a70ebe40 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -11,7 +11,6 @@ def to_camel(text): def setup_model(config: Coqpit): """Load models directly from configuration.""" - print(" > Vocoder Model: {}".format(config.model)) if "discriminator_model" in config and "generator_model" in config: MyModel = importlib.import_module("TTS.vocoder.models.gan") MyModel = getattr(MyModel, "GAN") @@ -28,6 +27,7 @@ def setup_model(config: Coqpit): MyModel = getattr(MyModel, to_camel(config.model)) except ModuleNotFoundError as e: raise ValueError(f"Model {config.model} not exist!") from e + print(" > Vocoder Model: {}".format(config.model)) model = MyModel(config) return model diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 5c9e67da..614e42e0 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -43,6 +43,7 @@ config = FastPitchConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, + use_espeak_phonemes=False, phoneme_language="en-us", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), print_step=50, diff --git a/requirements.txt b/requirements.txt index b92947a0..a87a3c6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,3 +25,4 @@ unidic-lite==1.0.8 # gruut+supported langs gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 fsspec>=2021.04.0 +pyworld \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py index a7878132..2b07004f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -7,7 +7,10 @@ from TTS.utils.generic_utils import get_cuda def get_device_id(): use_cuda, _ = get_cuda() if use_cuda: - GPU_ID = "0" + if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "": + GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0] + else: + GPU_ID = "0" else: GPU_ID = "" return GPU_ID
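
Note (not part of the patch): the sketch below reproduces, in isolation, the pyworld-based pitch path that compute_f0 switches to above. The wav path and the sample_rate / hop_length / f0_ceil values are illustrative stand-ins for the corresponding AudioProcessor attributes (they mirror a typical LJSpeech setup), not values fixed by this change.

    # Standalone sketch of the dio + stonemask pitch extraction used in the new compute_f0.
    import librosa
    import numpy as np
    import pyworld as pw

    sample_rate = 22050          # placeholder for self.sample_rate
    hop_length = 256             # placeholder for self.hop_length
    f0_ceil = 8000.0             # the patch passes self.mel_fmax here

    wav, _ = librosa.load("example.wav", sr=sample_rate)  # "example.wav" is a placeholder path
    x = wav.astype(np.double)    # pyworld expects a float64 signal

    # Coarse F0 track; frame_period (in ms) is chosen so pitch frames
    # line up with mel-spectrogram frames (one estimate per hop).
    f0, t = pw.dio(
        x,
        fs=sample_rate,
        f0_ceil=f0_ceil,
        frame_period=1000 * hop_length / sample_rate,
    )
    # Refine the coarse estimate against the raw waveform.
    f0 = pw.stonemask(x, f0, t, sample_rate)

    print(f0.shape)  # roughly len(x) // hop_length + 1 frames; unvoiced frames are 0.0

Compared with the librosa.pyin call it replaces (left commented out in the hunk above), dio + stonemask runs considerably faster on CPU, which is presumably the motivation for the switch.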