Added an f0 inference overhaul. Added the dio and crepe f0 algorithms to inference and a crepe_hop_length slider on the main GUI. I can confirm that crepe sounds much better than harvest, dio, and pm, and is more stable, especially when using small hop lengths.

Mangio621
2023-05-03 10:58:42 +10:00
parent 4408673b0f
commit 6fc8c84083
9 changed files with 138 additions and 27 deletions
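For readers unfamiliar with torchcrepe, the crepe path added here boils down to a single predict call plus a resize back to the pipeline's frame count. The following is a minimal, illustrative sketch, not the committed code; it assumes 16 kHz mono audio, the torchcrepe package (pip install torchcrepe), and the same 50-1100 Hz pitch range the pipeline uses:

import numpy as np
import torch
import torchcrepe

def crepe_f0_sketch(audio_np, sr=16000, hop_length=128):
    # At 16 kHz, a hop length of 128 samples gives one pitch estimate every 8 ms;
    # 512 gives one every 32 ms (faster to infer, coarser pitch tracking).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    audio = torch.from_numpy(audio_np.astype(np.float32)).unsqueeze(0).to(device)
    f0 = torchcrepe.predict(
        audio, sr, hop_length,
        fmin=50, fmax=1100,
        model="full",              # "tiny" trades accuracy for speed
        batch_size=hop_length * 2,
        device=device,
        pad=True,
    )
    return f0.squeeze(0).cpu().numpy()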

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "transpose(integer, number of semitones, octave sharp 12 octave flat -12)",
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (the default is the correct format example)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Select the algorithm for pitch extraction. Use 'pm' to speed up for singing voices, or use 'harvest' for better low-pitched voices, but it is extremely slow.",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "Feature search database file path",
"特征文件路径": "Feature file path",
"检索特征占比": "Search feature ratio",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
"输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Seleccione el algoritmo para la extracción de tono. Use 'pm' para acelerar las voces cantadas, o use 'harvest' para mejorar las voces bajas, pero es extremadamente lento.",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "Ruta del archivo de la base de datos de búsqueda de características",
"特征文件路径": "Ruta del archivo de características",
"检索特征占比": "Proporción de función de búsqueda",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "ピッチ抽出アルゴリズムを選択してください。歌声の場合は、pmを使用して速度を上げることができます。低音が重要な場合は、harvestを使用できますが、非常に遅くなります。",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徴量検索データベースのファイルパス",
"特征文件路径": "特徴量ファイルのパス",
"检索特征占比": "検索特徴率",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特征检索库文件路径",
"特征文件路径": "特征文件路径",
"检索特征占比": "检索特征占比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -133,6 +133,7 @@ def vc_single(
file_index,
# file_big_npy,
index_rate,
crepe_hop_length,
): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
global tgt_sr, net_g, vc, hubert_model
if input_audio is None:
@@ -167,6 +168,7 @@ def vc_single(
# file_big_npy,
index_rate,
if_f0,
crepe_hop_length,
f0_file=f0_file,
)
print(
@@ -1064,10 +1066,18 @@ with gr.Blocks() as app:
)
f0method0 = gr.Radio(
label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"),
choices=["pm", "harvest"],
choices=["pm", "harvest", "dio", "crepe"], # Fork Feature. Add the crepe radio button for crepe INFERENCE
value="pm",
interactive=True,
)
crepe_hop_length = gr.Slider(
minimum=32,
maximum=512,
step=32,
label=i18n("crepe_hop_length"),
value=128,
interactive=True
)
with gr.Column():
file_index1 = gr.Textbox(
label=i18n("特征检索库文件路径"),
@@ -1102,6 +1112,7 @@ with gr.Blocks() as app:
file_index1,
# file_big_npy1,
index_rate1,
crepe_hop_length
],
[vc_output1, vc_output2],
)

View File

@@ -1,13 +1,14 @@
import numpy as np, parselmouth, torch, pdb
from time import time as ttime
import torch.nn.functional as F
import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
import scipy.signal as signal
import pyworld, os, traceback, faiss
from scipy import signal
from torch import Tensor # Fork Feature. Used for pitch prediction for the torchcrepe f0 inference computation
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
class VC(object):
def __init__(self, tgt_sr, config):
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
@@ -27,29 +28,40 @@ class VC(object):
self.t_max = self.sr * self.x_max # 免查询时长阈值
self.device = config.device
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
#region f0 Overhaul Region
# Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Returns a torch.device.
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
# Get cuda device
if torch.cuda.is_available():
return torch.device(f"cuda:{index % torch.cuda.device_count()}")
elif torch.backends.mps.is_available():
return torch.device("mps")
# TODO: insert an "xla" branch here to grab XLA devices if available (requires the torch_xla.core.xla_model library)
# Otherwise return "cpu" as the torch device
return torch.device("cpu")
# Get the f0 via parselmouth computation
def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
return f0
# Get the f0 via the pyworld computation. Fork Feature +dio along with harvest
def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
if f0_type == "harvest":
f0, t = pyworld.harvest(
x.astype(np.double),
fs=self.sr,
@@ -57,8 +69,87 @@ class VC(object):
f0_floor=f0_min,
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
elif f0_type == "dio":
f0, t = pyworld.dio(
x.astype(np.double),
fs=self.sr,
f0_ceil=f0_max,
f0_floor=f0_min,
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
return f0
# Fork Feature: Resize f0 for f0 retrieved from torchcrepe Tensor prediction
def resize_f0(self, x, target_len):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(
np.arange(0, len(source) * target_len, len(source)) / target_len,
np.arange(0, len(source)),
source,
)
resized = np.nan_to_num(target)
return resized
# Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
def get_f0_crepe_computation(
self,
x,
f0_min,
f0_max,
p_len,
hop_length=128, # 512 before. Hop length is the number of samples between pitch frames, i.e. how quickly pitch changes can be tracked. Lower hop lengths mean more pitch accuracy but longer inference time.
model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
):
x = x.astype(np.float32) # fixes the F.conv2d exception; we needed to convert double to float32
x /= np.quantile(np.abs(x), 0.999)
torch_device = self.get_optimal_torch_device()
audio = torch.from_numpy(x).to(torch_device, copy=True)
audio = torch.unsqueeze(audio, dim=0)
if audio.ndim == 2 and audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True).detach()
audio = audio.detach()
print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
pitch: Tensor = torchcrepe.predict(
audio,
self.sr,
hop_length,
f0_min,
f0_max,
model,
batch_size=hop_length * 2,
device=torch_device,
pad=True
)
f0 = pitch.squeeze(0).cpu().float().numpy()
p_len = p_len or x.shape[0] // hop_length
f0 = self.resize_f0(f0, p_len)
return f0
#endregion
def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
elif f0_method == "harvest":
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
elif f0_method == "dio": # Fork Feature
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
# Add crepe-tiny method here
print("Using the following f0 method: " + f0_method)
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数
@@ -82,6 +173,7 @@ class VC(object):
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak # 1-0
def vc(
@@ -189,6 +281,7 @@ class VC(object):
# file_big_npy,
index_rate,
if_f0,
crepe_hop_length,
f0_file=None,
):
if (
@@ -243,7 +336,7 @@ class VC(object):
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()