ADDED f0 CREPE PITCH EXTRACTION FOR TRAINING. Highly experimental; still need to test how well it works for training models. Added crepe components to the training tab in the UI. Updated the README.

This commit is contained in:
Mangio621
2023-05-05 07:11:23 +10:00
parent c9da303339
commit bbe6bec466
3 changed files with 67 additions and 10 deletions

View File

@@ -28,13 +28,14 @@ A fork of an easy-to-use SVC framework based on VITS with top1 retrieval 💯. <
> High quality licensed song datasets will be added to training-set one after another for your use, without worrying about copyright infringement.
# Summary 📘
## Features that this fork (Mangio-RVC-Fork) includes that the original repo doesn't ☑️
## Features that this fork (Mangio-RVC-Fork) has that the original repo doesn't ☑️
+ Local inference with the conv2d 'Half' exception fix. Apply the argument --use_gfloat to infer-web.py to use this fix.
+ f0 Inference algorithm overhaul:
+ f0 Inference algorithm overhaul: 🌟
+ Added pyworld dio f0 method.
+ Added torchcrepe crepe f0 method. (Increases pitch accuracy and stability a lot)
+ Modifiable crepe_hop_length for the crepe algorithm via the web GUI
+ Paperspace integration
+ f0 Crepe Pitch Extraction for training. 🌟 (EXPERIMENTAL)
+ Paperspace integration 🌟
+ Paperspace argument on infer-web.py (--paperspace) that shares a gradio link
+ Make file for paperspace users
+ Tensorboard access via Makefile (make tensorboard)
@@ -49,9 +50,9 @@ A fork of an easy-to-use SVC framework based on VITS with top1 retrieval 💯. <
+ Use the UVR5 model to quickly separate vocals and instruments.
## Features planned to be added during the fork's development ▶️
+ f0 crepe pitch estimation on training.
+ Improved GUI (More convenience).
+ Google colab notebook for this fork.
+ Automatic removal of old generations to save space.
# Installing the Dependencies 🖥️
Using pip (python3.9.8 is stable with this fork)

View File

@@ -3,6 +3,9 @@ import librosa
import pyworld
from scipy.io import wavfile
import numpy as np, logging
import torchcrepe # Fork Feature. Crepe algo for training and preprocess
import torch
from torch import Tensor # Fork Feature. Used for pitch prediction for torch crepe.
logging.getLogger("numba").setLevel(logging.WARNING)
from multiprocessing import Process
@@ -19,6 +22,7 @@ def printt(strr):
n_p = int(sys.argv[2])
f0method = sys.argv[3]
extraction_crepe_hop_length = int(sys.argv[4])
class FeatureInput(object):
@@ -32,16 +36,16 @@ class FeatureInput(object):
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
def compute_f0(self, path, f0_method):
def compute_f0(self, path, f0_method, crepe_hop_length):
# default resample type of librosa.resample is "soxr_hq".
# Quality: soxr_vhq > soxr_hq
x, sr = librosa.load(path, self.fs) # , res_type='soxr_vhq'
p_len = x.shape[0] // self.hop
f0_min = 50
f0_max = 1100
assert sr == self.fs
if f0_method == "pm":
time_step = 160 / 16000 * 1000
f0_min = 50
f0_max = 1100
f0 = (
parselmouth.Sound(x, sr)
.to_pitch_ac(
@@ -75,6 +79,48 @@ class FeatureInput(object):
frame_period=1000 * self.hop / sr,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif f0_method == "crepe": # Fork Feature: Added crepe f0 for f0 feature extraction
x = x.astype(np.float32)
x /= np.quantile(np.abs(x), 0.999)
torch_device_index = 0
torch_device = None
if torch.cuda.is_available():
torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
elif torch.backends.mps.is_available():
torch_device = torch.device("mps")
else:
torch_device = torch.device("cpu")
audio = torch.from_numpy(x).to(torch_device, copy=True)
audio = torch.unsqueeze(audio, dim=0)
if audio.ndim == 2 and audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True).detach()
audio = audio.detach()
print(
"Initiating f0 Crepe Feature Extraction with an extraction_crepe_hop_length of: " +
str(crepe_hop_length)
)
# Pitch prediction for pitch extraction
pitch: Tensor = torchcrepe.predict(
audio,
sr,
crepe_hop_length,
f0_min,
f0_max,
"full",
batch_size=crepe_hop_length * 2,
device=torch_device,
pad=True
)
p_len = p_len or x.shape[0] // crepe_hop_length
# Resize the pitch
source = np.array(pitch.squeeze(0).cpu().float().numpy())
source[source < 0.001] = np.nan
target = np.interp(
np.arange(0, len(source) * p_len, len(source)) / p_len,
np.arange(0, len(source)),
source
)
f0 = np.nan_to_num(target)
return f0
def coarse_f0(self, f0):
@@ -93,7 +139,7 @@ class FeatureInput(object):
)
return f0_coarse
def go(self, paths, f0_method):
def go(self, paths, f0_method, crepe_hop_length):
if len(paths) == 0:
printt("no-f0-todo")
else:
@@ -108,7 +154,7 @@ class FeatureInput(object):
and os.path.exists(opt_path2 + ".npy") == True
):
continue
featur_pit = self.compute_f0(inp_path, f0_method)
featur_pit = self.compute_f0(inp_path, f0_method, crepe_hop_length)
np.save(
opt_path2,
featur_pit,
@@ -152,6 +198,7 @@ if __name__ == "__main__":
args=(
paths[i::n_p],
f0method,
extraction_crepe_hop_length,
),
)
p.start()

View File

@@ -462,6 +462,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir):
exp_dir,
n_p,
f0method,
extraction_crepe_hop_length,
)
print(cmd)
p = Popen(cmd, shell=True, cwd=now_dir) # , stdin=PIPE, stdout=PIPE,stderr=PIPE
@@ -1299,10 +1300,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
label=i18n(
"选择音高提取算法:输入歌声可用pm提速,高质量语音但CPU差可用dio提速,harvest质量更好但慢"
),
choices=["pm", "harvest", "dio"],
choices=["pm", "harvest", "dio", "crepe"], # Fork feature: Crepe on f0 extraction for training.
value="harvest",
interactive=True,
)
extraction_crepe_hop_length = gr.Slider(
minimum=1,
maximum=512,
step=1,
label=i18n("crepe_hop_length"),
value=64,
interactive=True
)
but2 = gr.Button(i18n("特征提取"), variant="primary")
info2 = gr.Textbox(label=i18n("输出信息"), value="", max_lines=8)
but2.click(