Added an f0 inference overhaul. Added the dio and crepe f0 algorithms to inference and a crepe_hop_length slider on the main GUI. I can confirm that crepe sounds much better than harvest, dio, and pm, and is more stable, especially when using small hop lengths.

Mangio621
2023-05-03 10:58:42 +10:00
parent 4408673b0f
commit 6fc8c84083
9 changed files with 138 additions and 27 deletions
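For readers unfamiliar with torchcrepe, the crepe path added here boils down to a single predict call plus a resize back to the pipeline's frame count. The following is a minimal, illustrative sketch, not the committed code; it assumes 16 kHz mono audio, the torchcrepe package (pip install torchcrepe), and the same 50-1100 Hz pitch range the pipeline uses:

import numpy as np
import torch
import torchcrepe

def crepe_f0_sketch(audio_np, sr=16000, hop_length=128):
    # At 16 kHz, a hop length of 128 samples gives one pitch estimate every 8 ms;
    # 512 gives one every 32 ms (faster to infer, coarser pitch tracking).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    audio = torch.from_numpy(audio_np.astype(np.float32)).unsqueeze(0).to(device)
    f0 = torchcrepe.predict(
        audio, sr, hop_length,
        fmin=50, fmax=1100,
        model="full",              # "tiny" trades accuracy for speed
        batch_size=hop_length * 2,
        device=device,
        pad=True,
    )
    return f0.squeeze(0).cpu().numpy()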

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "transpose(integer, number of semitones, octave sharp 12 octave flat -12)",
"输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (the default is the correct format example)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Select the algorithm for pitch extraction. Use 'pm' to speed up for singing voices, or use 'harvest' for better low-pitched voices, but it is extremely slow.",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "Feature search database file path",
"特征文件路径": "Feature file path",
"检索特征占比": "Search feature ratio",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)",
"输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Seleccione el algoritmo para la extracción de tono. Use 'pm' para acelerar las voces cantadas, o use 'harvest' para mejorar las voces bajas, pero es extremadamente lento.",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "Ruta del archivo de la base de datos de búsqueda de características",
"特征文件路径": "Ruta del archivo de características",
"检索特征占比": "Proporción de función de búsqueda",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "ピッチ抽出アルゴリズムを選択してください。歌声の場合は、pmを使用して速度を上げることができます。低音が重要な場合は、harvestを使用できますが、非常に遅くなります。",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徴量検索データベースのファイルパス",
"特征文件路径": "特徴量ファイルのパス",
"检索特征占比": "検索特徴率",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特征检索库文件路径",
"特征文件路径": "特征文件路径",
"检索特征占比": "检索特征占比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -17,6 +17,7 @@
"变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)",
"输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)",
"选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速harvest 低音好但巨慢無比",
"crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.",
"特征检索库文件路径": "特徵檢索庫檔案路徑",
"特征文件路径": "特徵檔案路徑",
"检索特征占比": "檢索特徵佔比",

View File

@@ -133,6 +133,7 @@ def vc_single(
file_index,
# file_big_npy,
index_rate,
crepe_hop_length,
): # spk_item, input_audio0, vc_transform0,f0_file,f0method0
global tgt_sr, net_g, vc, hubert_model
if input_audio is None:
@@ -167,6 +168,7 @@ def vc_single(
# file_big_npy,
index_rate,
if_f0,
crepe_hop_length,
f0_file=f0_file,
)
print(
@@ -1064,10 +1066,18 @@ with gr.Blocks() as app:
)
f0method0 = gr.Radio(
label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"),
choices=["pm", "harvest"],
choices=["pm", "harvest", "dio", "crepe"], # Fork Feature. Add the crepe radio button for crepe INFERENCE
value="pm",
interactive=True,
)
crepe_hop_length = gr.Slider(
minimum=32,
maximum=512,
step=32,
label=i18n("crepe_hop_length"),
value=128,
interactive=True
)
with gr.Column():
file_index1 = gr.Textbox(
label=i18n("特征检索库文件路径"),
@@ -1102,6 +1112,7 @@ with gr.Blocks() as app:
file_index1,
# file_big_npy1,
index_rate1,
crepe_hop_length
],
[vc_output1, vc_output2],
)

View File

@@ -1,13 +1,14 @@
import numpy as np, parselmouth, torch, pdb
from time import time as ttime
import torch.nn.functional as F
import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
import scipy.signal as signal
import pyworld, os, traceback, faiss
from scipy import signal
from torch import Tensor # Fork Feature. Used for pitch prediction for the torchcrepe f0 inference computation
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
class VC(object):
def __init__(self, tgt_sr, config):
self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
@@ -27,29 +28,40 @@ class VC(object):
self.t_max = self.sr * self.x_max # 免查询时长阈值
self.device = config.device
def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
#region f0 Overhaul Region
# Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Returns a torch.device.
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
# Get cuda device
if torch.cuda.is_available():
return torch.device(f"cuda:{index % torch.cuda.device_count()}")
elif torch.backends.mps.is_available():
return torch.device("mps")
# TODO: insert an "xla" branch here to grab XLA devices if available (requires the torch_xla.core.xla_model library)
# Otherwise return "cpu" as the torch device
return torch.device("cpu")
# Get the f0 via parselmouth computation
def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len):
f0 = (
parselmouth.Sound(x, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
return f0
# Get the f0 via the pyworld computation. Fork Feature +dio along with harvest
def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type):
if f0_type == "harvest":
f0, t = pyworld.harvest(
x.astype(np.double),
fs=self.sr,
@@ -57,8 +69,87 @@ class VC(object):
f0_floor=f0_min,
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
elif f0_type == "dio":
f0, t = pyworld.dio(
x.astype(np.double),
fs=self.sr,
f0_ceil=f0_max,
f0_floor=f0_min,
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
return f0
# Fork Feature: Resize f0 for f0 retrieved from torchcrepe Tensor prediction
def resize_f0(self, x, target_len):
source = np.array(x)
source[source < 0.001] = np.nan
target = np.interp(
np.arange(0, len(source) * target_len, len(source)) / target_len,
np.arange(0, len(source)),
source,
)
resized = np.nan_to_num(target)
return resized
# Fork Feature: Get the f0 via the crepe algorithm from torchcrepe
def get_f0_crepe_computation(
self,
x,
f0_min,
f0_max,
p_len,
hop_length=128, # 512 before. Hop length is the number of samples between pitch frames, i.e. how quickly pitch changes can be tracked. Lower hop lengths mean more pitch accuracy but longer inference time.
model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
):
x = x.astype(np.float32) # fixes the F.conv2d exception; we needed to convert double to float32
x /= np.quantile(np.abs(x), 0.999)
torch_device = self.get_optimal_torch_device()
audio = torch.from_numpy(x).to(torch_device, copy=True)
audio = torch.unsqueeze(audio, dim=0)
if audio.ndim == 2 and audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True).detach()
audio = audio.detach()
print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
pitch: Tensor = torchcrepe.predict(
audio,
self.sr,
hop_length,
f0_min,
f0_max,
model,
batch_size=hop_length * 2,
device=torch_device,
pad=True
)
f0 = pitch.squeeze(0).cpu().float().numpy()
p_len = p_len or x.shape[0] // hop_length
f0 = self.resize_f0(f0, p_len)
return f0
#endregion
def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None):
time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm":
f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len)
elif f0_method == "harvest":
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest")
elif f0_method == "dio": # Fork Feature
f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio")
elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
# Add crepe-tiny method here
print("Using the following f0 method: " + f0_method)
f0 *= pow(2, f0_up_key / 12)
# with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数
@@ -82,6 +173,7 @@ class VC(object):
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak # 1-0
def vc(
@@ -189,6 +281,7 @@ class VC(object):
# file_big_npy,
index_rate,
if_f0,
crepe_hop_length,
f0_file=None,
):
if (
@@ -243,7 +336,7 @@ class VC(object):
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0)
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()