diff --git a/i18n/en_US.json b/i18n/en_US.json index faab26c..71ff7c0 100644 --- a/i18n/en_US.json +++ b/i18n/en_US.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "transpose(integer, number of semitones, octave sharp 12 octave flat -12)", "输入待处理音频文件路径(默认是正确格式示例)": "Enter the path of the audio file to be processed (the default is the correct format example)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Select the algorithm for pitch extraction. Use 'pm' to speed up for singing voices, or use 'harvest' for better low-pitched voices, but it is extremely slow.", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "Feature search database file path", "特征文件路径": "Feature file path", "检索特征占比": "Search feature ratio", diff --git a/i18n/es_ES.json b/i18n/es_ES.json index 363d4e4..53a14ee 100644 --- a/i18n/es_ES.json +++ b/i18n/es_ES.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "Cambio de tono (entero, número de semitonos, subir una octava +12 o bajar una octava -12)", "输入待处理音频文件路径(默认是正确格式示例)": "Ingrese la ruta del archivo del audio que se procesará (el formato predeterminado es el ejemplo correcto)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "Seleccione el algoritmo para la extracción de tono. Use 'pm' para acelerar las voces cantadas, o use 'harvest' para mejorar las voces bajas, pero es extremadamente lento.", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "Ruta del archivo de la base de datos de búsqueda de características", "特征文件路径": "Ruta del archivo de características", "检索特征占比": "Proporción de función de búsqueda", diff --git a/i18n/ja_JP.json b/i18n/ja_JP.json index 5261b35..40fb80f 100644 --- a/i18n/ja_JP.json +++ b/i18n/ja_JP.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "ピッチ変更(整数、半音数、上下オクターブ12-12)", "输入待处理音频文件路径(默认是正确格式示例)": "処理対象音声ファイルのパスを入力してください(デフォルトは正しいフォーマットの例です)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "ピッチ抽出アルゴリズムを選択してください。歌声の場合は、pmを使用して速度を上げることができます。低音が重要な場合は、harvestを使用できますが、非常に遅くなります。", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特徴量検索データベースのファイルパス", "特征文件路径": "特徴量ファイルのパス", "检索特征占比": "検索特徴率", diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json index bd5d5bf..0691b2d 100644 --- a/i18n/zh_CN.json +++ b/i18n/zh_CN.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "变调(整数, 半音数量, 升八度12降八度-12)", "输入待处理音频文件路径(默认是正确格式示例)": "输入待处理音频文件路径(默认是正确格式示例)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特征检索库文件路径", "特征文件路径": "特征文件路径", "检索特征占比": "检索特征占比", diff --git a/i18n/zh_HK.json b/i18n/zh_HK.json index ac3616e..25c6212 100644 --- a/i18n/zh_HK.json +++ b/i18n/zh_HK.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特徵檢索庫檔案路徑", "特征文件路径": "特徵檔案路徑", "检索特征占比": "檢索特徵佔比", diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json index ac3616e..25c6212 100644 --- a/i18n/zh_SG.json +++ b/i18n/zh_SG.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特徵檢索庫檔案路徑", "特征文件路径": "特徵檔案路徑", "检索特征占比": "檢索特徵佔比", diff --git a/i18n/zh_TW.json b/i18n/zh_TW.json index ac3616e..25c6212 100644 --- a/i18n/zh_TW.json +++ b/i18n/zh_TW.json @@ -17,6 +17,7 @@ "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比": "選擇音高提取演算法,輸入歌聲可用 pm 提速,harvest 低音好但巨慢無比", + "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特徵檢索庫檔案路徑", "特征文件路径": "特徵檔案路徑", "检索特征占比": "檢索特徵佔比", diff --git a/infer-web.py b/infer-web.py index 394a4c9..790ed42 100644 --- a/infer-web.py +++ b/infer-web.py @@ -133,6 +133,7 @@ def vc_single( file_index, # file_big_npy, index_rate, + crepe_hop_length, ): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 global tgt_sr, net_g, vc, hubert_model if input_audio is None: @@ -167,6 +168,7 @@ # file_big_npy, index_rate, if_f0, + crepe_hop_length, f0_file=f0_file, ) print( @@ -1064,10 +1066,18 @@ with gr.Blocks() as app: ) f0method0 = gr.Radio( label=i18n("选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比"), - choices=["pm", "harvest"], + choices=["pm", "harvest", "dio", "crepe"], # Fork Feature. Add the crepe radio button for crepe INFERENCE value="pm", interactive=True, ) + crepe_hop_length = gr.Slider( + minimum=32, + maximum=512, + step=32, + label=i18n("crepe_hop_length"), + value=128, + interactive=True + ) with gr.Column(): file_index1 = gr.Textbox( label=i18n("特征检索库文件路径"), @@ -1102,6 +1112,7 @@ file_index1, # file_big_npy1, index_rate1, + crepe_hop_length ], [vc_output1, vc_output2], ) diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py index 2b8e099..dd9756d 100644 --- a/vc_infer_pipeline.py +++ b/vc_infer_pipeline.py @@ -1,13 +1,14 @@ import numpy as np, parselmouth, torch, pdb from time import time as ttime import torch.nn.functional as F +import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe) import scipy.signal as signal import pyworld, os, traceback, faiss from scipy import signal +from torch import Tensor # Fork Feature. Used for pitch prediction for the torchcrepe f0 inference computation bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) - class VC(object): def __init__(self, tgt_sr, config): self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( @@ -27,29 +28,40 @@ class VC(object): self.t_max = self.sr * self.x_max # 免查询时长阈值 self.device = config.device - def get_f0(self, x, p_len, f0_up_key, f0_method, inp_f0=None): - time_step = self.window / self.sr * 1000 - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] + #region f0 Overhaul Region + # Fork Feature: Get the best torch device to use for f0 algorithms that require a torch device. Will return the type (torch.device) + def get_optimal_torch_device(index: int = 0) -> torch.device: + # Get cuda device + if torch.cuda.is_available(): + return torch.device(f"cuda:{index % torch.cuda.device_count()}") + elif torch.backends.mps.is_available(): + return torch.device("mps") + # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library + # Else wise return the "cpu" as a torch device, + return torch.device("cpu") + + # Get the f0 via parselmouth computation + def get_f0_pm_computation(self, x, time_step, f0_min, f0_max, p_len): + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif f0_method == "harvest": + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + return f0 + + # Get the f0 via the pyworld computation. Fork Feature +dio along with harvest + def get_f0_pyworld_computation(self, x, f0_min, f0_max, f0_type): + if f0_type == "harvest": f0, t = pyworld.harvest( x.astype(np.double), fs=self.sr, @@ -57,8 +69,87 @@ class VC(object): f0_floor=f0_min, frame_period=10, ) - f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) - f0 = signal.medfilt(f0, 3) + elif f0_type == "dio": + f0, t = pyworld.dio( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + return f0 + + # Fork Feature: Resize f0 for f0 retrieved from torchcrepe Tensor prediction + def resize_f0(self, x, target_len): + source = np.array(x) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * target_len, len(source)) / target_len, + np.arange(0, len(source)), + source, + ) + resized = np.nan_to_num(target) + return resized + + # Fork Feature: Get the f0 via the crepe algorithm from torchcrepe + def get_f0_crepe_computation( + self, + x, + f0_min, + f0_max, + p_len, + hop_length=128, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. + model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full + ): + x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. + x /= np.quantile(np.abs(x), 0.999) + torch_device = self.get_optimal_torch_device() + audio = torch.from_numpy(x).to(torch_device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + + print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) + pitch: Tensor = torchcrepe.predict( + audio, + self.sr, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=torch_device, + pad=True + ) + + f0 = pitch.squeeze(0).cpu().float().numpy() + p_len = p_len or x.shape[0] // hop_length + f0 = self.resize_f0(f0, p_len) + return f0 + + #endregion + + def get_f0(self, x, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0=None): + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = self.get_f0_pm_computation(x, time_step, f0_min, f0_max, p_len) + elif f0_method == "harvest": + f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "harvest") + elif f0_method == "dio": # Fork Feature + f0 = self.get_f0_pyworld_computation(x, f0_min, f0_max, "dio") + elif f0_method == "crepe": # Fork Feature: Adding a new f0 algorithm called crepe + f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length) + # Add crepe-tiny method here + + print("Using the following f0 method: " + f0_method) f0 *= pow(2, f0_up_key / 12) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
tf0 = self.sr // self.window # 每秒f0点数 @@ -82,6 +173,7 @@ class VC(object): f0_mel[f0_mel <= 1] = 1 f0_mel[f0_mel > 255] = 255 f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak # 1-0 def vc( @@ -189,6 +281,7 @@ class VC(object): # file_big_npy, index_rate, if_f0, + crepe_hop_length, f0_file=None, ): if ( @@ -243,7 +336,7 @@ class VC(object): sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() pitch, pitchf = None, None if if_f0 == 1: - pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0) + pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, crepe_hop_length, inp_f0) pitch = pitch[:p_len] pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()