mirror of
https://github.com/Mangio621/Mangio-RVC-Fork.git
synced 2026-02-24 03:49:51 +01:00
Added an f0 inference overhaul. Added the dio and crepe f0 algorithms to inference and a crepe_hop_length slider on the main GUI. I can confirm that crepe sounds much better than harvest, dio and pm, and is more stable, especially when using small hop lengths.
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "transpose(integer, number of semitones, octave sharp 12 octave flat -12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (the default is the correct format example)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Select the algorithm for pitch extraction. Use 'pm' to speed up for singing voices, or use 'harvest' for better low-pitched voices, but it is extremely slow.",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "Feature search database file path",
|
||||
"特征文件路径": "Feature file path",
|
||||
"检索特征占比": "Search feature ratio",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Seleccione el algoritmo para la extracción de tono. Use 'pm' para acelerar las voces cantadas, o use 'harvest' para mejorar las voces bajas, pero es extremadamente lento.",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "Ruta del archivo de la base de datos de búsqueda de características",
|
||||
"特征文件路径": "Ruta del archivo de características",
|
||||
"检索特征占比": "Proporción de función de búsqueda",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "ピッチ抽出アルゴリズムを選択してください。歌声の場合は、pmを使用して速度を上げることができます。低音が重要な場合は、harvestを使用できますが、非常に遅くなります。",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "特徴量検索データベースのファイルパス",
|
||||
"特征文件路径": "特徴量ファイルのパス",
|
||||
"检索特征占比": "検索特徴率",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "特征检索库文件路径",
|
||||
"特征文件路径": "特征文件路径",
|
||||
"检索特征占比": "检索特征占比",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
||||
"特征文件路径": "特徵檔案路徑",
|
||||
"检索特征占比": "檢索特徵佔比",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
||||
"特征文件路径": "特徵檔案路徑",
|
||||
"检索特征占比": "檢索特徵佔比",
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
|
||||
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
|
||||
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比",
|
||||
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
|
||||
"特征检索库文件路径": "特徵檢索庫檔案路徑",
|
||||
"特征文件路径": "特徵檔案路徑",
|
||||
"检索特征占比": "檢索特徵佔比",
|
||||
|
||||
13
infer-web.py
13
infer-web.py
@@ -133,6 +133,7 @@ def vc_single(
|
||||
file_index,
|
||||
# file_big_npy,
|
||||
index_rate,
|
||||
crepe_hop_length,
|
||||
): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
|
||||
global tgt_sr, net_g, vc, hubert_model
|
||||
if input_audio is None:
|
||||
@@ -167,6 +168,7 @@ def vc_single(
|
||||
# file_big_npy,
|
||||
index_rate,
|
||||
if_f0,
|
||||
crepe_hop_length,
|
||||
f0_file=f0_file,
|
||||
)
|
||||
print(
|
||||
@@ -1064,10 +1066,18 @@ with gr.Blocks() as app:
|
||||
)
|
||||
f0method0 = gr.Radio(
|
||||
label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"),
|
||||
choices=["pm", "harvest"],
|
||||
choices=["pm", "harvest", "dio", "crepe"], # Fork Feature. Add the crepe radio button for crepe INFERENCE
|
||||
value="pm",
|
||||
interactive=True,
|
||||
)
|
||||
crepe_hop_length = gr.Slider(
|
||||
minimum=32,
|
||||
maximum=512,
|
||||
step=32,
|
||||
label=i18n("crepe_hop_length"),
|
||||
value=128,
|
||||
interactive=True
|
||||
)
|
||||
with gr.Column():
|
||||
file_index1 = gr.Textbox(
|
||||
label=i18n("特征检索库文件路径"),
|
||||
@@ -1102,6 +1112,7 @@ with gr.Blocks() as app:
|
||||
file_index1,
|
||||
# file_big_npy1,
|
||||
index_rate1,
|
||||
crepe_hop_length
|
||||
],
|
||||
[vc_output1, vc_output2],
|
||||
)
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
import numpy as np, parselmouth, torch, pdb
|
||||
from time import time as ttime
|
||||
import torch.nn.functional as F
|
||||
import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
|
||||
import scipy.signal as signal
|
||||
import pyworld, os, traceback, faiss
|
||||
from scipy import signal
|
||||
from torch import Tensor # Fork Feature. Used for pitch prediction for the torchcrepe f0 inference computation
|
||||
|
||||
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
|
||||
|
||||
|
||||
class VC(object):
|
||||
def __init__(self, tgt_sr, config):
|
||||
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
|
||||
@@ -27,29 +28,40 @@ class VC(object):
|
||||
self.t_max = self.sr * self.x_max # 免查询时长阈值
|
||||
self.device = config.device
|
||||
|
||||
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
|
||||
time_step = self.window / self.sr * 1000
|
||||
f0_min = 50
|
||||
f0_max = 1100
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||
if f0_method == "pm":
|
||||
f0 = (
|
||||
parselmouth.Sound(x, self.sr)
|
||||
.to_pitch_ac(
|
||||
time_step=time_step / 1000,
|
||||
voicing_threshold=0.6,
|
||||
pitch_floor=f0_min,
|
||||
pitch_ceiling=f0_max,
|
||||
)
|
||||
.selected_array["frequency"]
|
||||
#region f0 Overhaul Region
|
||||
# Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device)
|
||||
@staticmethod
def get_optimal_torch_device(index: int = 0) -> torch.device:
    """Return the preferred torch device for f0 algorithms that need one.

    Declared as a static method because the function takes no ``self``:
    without the decorator, calling it through an instance binds the
    instance to ``index`` and the modulo below fails on CUDA hosts.

    Args:
        index: Requested CUDA device ordinal. It is wrapped with the number
            of visible CUDA devices, so any integer is accepted. Ignored
            when CUDA is unavailable.

    Returns:
        A ``cuda:N`` device when CUDA is available, otherwise ``mps`` when
        that backend reports itself available, otherwise ``cpu``.
    """
    if torch.cuda.is_available():
        return torch.device(f"cuda:{index % torch.cuda.device_count()}")
    # Look the MPS backend up defensively instead of assuming the attribute
    # exists on every torch build.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    # TODO: grab "xla" devices here if available (requires the
    # torch_xla.core.xla_model library).
    # Otherwise fall back to the CPU.
    return torch.device("cpu")
|
||||
|
||||
# Get the f0 via parselmouth computation
|
||||
def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
|
||||
f0 = (
|
||||
parselmouth.Sound(x, self.sr)
|
||||
.to_pitch_ac(
|
||||
time_step=time_step / 1000,
|
||||
voicing_threshold=0.6,
|
||||
pitch_floor=f0_min,
|
||||
pitch_ceiling=f0_max,
|
||||
)
|
||||
pad_size = (p_len - len(f0) + 1) // 2
|
||||
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
||||
f0 = np.pad(
|
||||
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
|
||||
)
|
||||
elif f0_method == "harvest":
|
||||
.selected_array["frequency"]
|
||||
)
|
||||
pad_size = (p_len - len(f0) + 1) // 2
|
||||
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
|
||||
f0 = np.pad(
|
||||
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
|
||||
)
|
||||
return f0
|
||||
|
||||
# Get the f0 via the pyworld computation. Fork Feature +dio along with harvest
|
||||
def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
|
||||
if f0_type == "harvest":
|
||||
f0, t = pyworld.harvest(
|
||||
x.astype(np.double),
|
||||
fs=self.sr,
|
||||
@@ -57,8 +69,87 @@ class VC(object):
|
||||
f0_floor=f0_min,
|
||||
frame_period=10,
|
||||
)
|
||||
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
||||
f0 = signal.medfilt(f0, 3)
|
||||
elif f0_type == "dio":
|
||||
f0, t = pyworld.dio(
|
||||
x.astype(np.double),
|
||||
fs=self.sr,
|
||||
f0_ceil=f0_max,
|
||||
f0_floor=f0_min,
|
||||
frame_period=10,
|
||||
)
|
||||
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
|
||||
f0 = signal.medfilt(f0, 3)
|
||||
return f0
|
||||
|
||||
# Fork Feature: Resize f0 for f0 retrieved from torchcrepe Tensor prediction
|
||||
def resize_f0(self, x, target_len):
    """Resample an f0 sequence to ``target_len`` points.

    Values below 0.001 are replaced by NaN before linear interpolation, and
    any NaN left in the interpolated result is turned back into 0.

    Args:
        x: Sequence of f0 values; it is copied, not modified.
        target_len: Number of points in the returned array.

    Returns:
        A numpy array of length ``target_len``.
    """
    contour = np.array(x)
    # Mark near-zero entries as missing so they do not pull neighbouring
    # interpolated values towards zero.
    missing = contour < 0.001
    contour[missing] = np.nan

    n_src = len(contour)
    # target_len evenly spaced positions, expressed in source-index units.
    query_positions = np.arange(0, n_src * target_len, n_src) / target_len
    source_positions = np.arange(0, n_src)
    interpolated = np.interp(query_positions, source_positions, contour)

    return np.nan_to_num(interpolated)
|
||||
|
||||
# Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
|
||||
def get_f0_crepe_computation(
    self,
    x,
    f0_min,
    f0_max,
    p_len,
    hop_length=128,  # 512 before. Lower hop lengths mean more pitch accuracy but longer inference time.
    model="full",  # Either crepe-tiny ("tiny") or crepe ("full"). Default is full.
):
    """Estimate f0 for ``x`` with torchcrepe and resize it to ``p_len``.

    Args:
        x: Audio samples as a numpy array (presumably 1-D mono - confirm
            against the caller).
        f0_min: Lower pitch bound passed to ``torchcrepe.predict``.
        f0_max: Upper pitch bound passed to ``torchcrepe.predict``.
        p_len: Desired number of output frames. When falsy,
            ``x.shape[0] // hop_length`` is used instead.
        hop_length: Hop length passed to ``torchcrepe.predict``; also used
            to derive the batch size (``hop_length * 2``).
        model: torchcrepe model name.

    Returns:
        A numpy array of f0 values produced by ``self.resize_f0``.
    """
    # Convert to float32: fixes the F.conv2d exception raised for double input.
    # astype() returns a copy, so the in-place scaling below never touches
    # the caller's array.
    x = x.astype(np.float32)
    peak = np.quantile(np.abs(x), 0.999)
    # Skip normalisation for silent input: dividing by zero would fill the
    # signal with NaN/inf before it reaches the model.
    if peak > 0:
        x /= peak
    torch_device = self.get_optimal_torch_device()
    audio = torch.from_numpy(x).to(torch_device, copy=True)
    # Add the leading batch dimension. After this shape[0] is always 1, so
    # the former "average multiple channels" branch could never run and has
    # been dropped.
    audio = torch.unsqueeze(audio, dim=0)

    print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
    pitch: Tensor = torchcrepe.predict(
        audio,
        self.sr,
        hop_length,
        f0_min,
        f0_max,
        model,
        batch_size=hop_length * 2,
        device=torch_device,
        pad=True,
    )

    f0 = pitch.squeeze(0).cpu().float().numpy()
    p_len = p_len or x.shape[0] // hop_length
    f0 = self.resize_f0(f0, p_len)
    return f0
|
||||
|
||||
#endregion
|
||||
|
||||
def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
|
||||
time_step = self.window / self.sr * 1000
|
||||
f0_min = 50
|
||||
f0_max = 1100
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||
if f0_method == "pm":
|
||||
f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
|
||||
elif f0_method == "harvest":
|
||||
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
|
||||
elif f0_method == "dio": # Fork Feature
|
||||
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
|
||||
elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe
|
||||
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
|
||||
# Add crepe-tiny method here
|
||||
|
||||
print("Using the following f0 method: " + f0_method)
|
||||
f0 *= pow(2, f0_up_key / 12)
|
||||
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
|
||||
tf0 = self.sr // self.window # 每秒f0点数
|
||||
@@ -82,6 +173,7 @@ class VC(object):
|
||||
f0_mel[f0_mel <= 1] = 1
|
||||
f0_mel[f0_mel > 255] = 255
|
||||
f0_coarse = np.rint(f0_mel).astype(np.int)
|
||||
|
||||
return f0_coarse, f0bak # 1-0
|
||||
|
||||
def vc(
|
||||
@@ -189,6 +281,7 @@ class VC(object):
|
||||
# file_big_npy,
|
||||
index_rate,
|
||||
if_f0,
|
||||
crepe_hop_length,
|
||||
f0_file=None,
|
||||
):
|
||||
if (
|
||||
@@ -243,7 +336,7 @@ class VC(object):
|
||||
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
||||
pitch, pitchf = None, None
|
||||
if if_f0 == 1:
|
||||
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
|
||||
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0)
|
||||
pitch = pitch[:p_len]
|
||||
pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
|
||||
Reference in New Issue
Block a user