From a2eb9742559ea7eccf9cde05205d8b4f92c7ff30 Mon Sep 17 00:00:00 2001 From: Mangio621 Date: Wed, 31 May 2023 00:37:37 +1000 Subject: [PATCH] Crepe Implementation use own hop_length. Update readme --- README.md | 4 ++-- extract_f0_print.py | 10 ++++++---- i18n/zh_SG.json | 4 ---- vc_infer_pipeline.py | 11 ++++++----- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 393dcb6..fba82b9 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Right now, the hybrid f0 method is only available on CLI, not GUI yet. But basic Here's how we would infer with the hybrid f0 method in cli: ```bash -MyModel.pth saudio/Source.wav Output.wav logs/mi-test/added.index 0 -2 hybrid[pm+crepe] 128 3 0 1 0.95 +MyModel.pth saudio/Source.wav Output.wav logs/mi-test/added.index 0 -2 hybrid[pm+crepe] 128 3 0 1 0.95 0.33 ``` Notice that the method is "hybrid[pm+crepe]" instead of a singular method like "harvest". @@ -91,7 +91,7 @@ hybrid[pm+harvest+crepe] Many f0 methods may be used. But are to be split with a delimiter of the '+' character. Keep in mind that inference will take much longer as we are calculating f0 X more times. # About the original repo's crepe method, compared to this forks crepe method (mangio-crepe) -The original repos crepe f0 computation method is slightly different to mine. Its arguable that in some areas, my crepe implementation sounds more stable in some parts. However, the orginal repo's crepe implementation gets rid of noise and artifacts much better. In this fork, i've also modified the original crepe computation to use the customizable crepe_hop_length feature on both the GUI and the CLI. Please let it be known, that each implementation sounds slightly different, and there isn't a clear "better" or "worse". It all depends on the context! +The original repo's crepe f0 computation method is slightly different to mine. It's arguable that in some areas, my crepe implementation sounds more stable in some parts. 
However, the original repo's crepe implementation gets rid of noise and artifacts much better. In this fork, my own crepe implementation (mangio-crepe) uses a customizable crepe_hop_length feature on both the GUI and the CLI which the original crepe doesn't have. Please let it be known, that each implementation sounds slightly different, and there isn't a clear "better" or "worse". It all depends on the context! If one must be chosen, I highly recommend using the original crepe implementation (not this fork's) as the developers of RVC have more control on fixing issues than I have. diff --git a/extract_f0_print.py b/extract_f0_print.py index e09bcec..d15c666 100644 --- a/extract_f0_print.py +++ b/extract_f0_print.py @@ -93,16 +93,17 @@ class FeatureInput(object): else: torch_device = torch.device("cpu") model = "full" + batch_size = 512 # Compute pitch using first gpu audio = torch.tensor(np.copy(x))[None].float() f0, pd = torchcrepe.predict( audio, self.fs, - crepe_hop_length, + 160, self.f0_min, self.f0_max, model, - batch_size=crepe_hop_length * 2, + batch_size=batch_size, device=torch_device, return_periodicity=True, ) @@ -239,16 +240,17 @@ class FeatureInput(object): else: torch_device = torch.device("cpu") model = "full" + batch_size = 512 # Compute pitch using first gpu audio = torch.tensor(np.copy(x))[None].float() f0, pd = torchcrepe.predict( audio, self.fs, - crepe_hop_length, + 160, self.f0_min, self.f0_max, model, - batch_size=crepe_hop_length * 2, + batch_size=batch_size, device=torch_device, return_periodicity=True, ) diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json index 7a5c4b7..2ee7bf6 100644 --- a/i18n/zh_SG.json +++ b/i18n/zh_SG.json @@ -16,14 +16,10 @@ "男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. 
": "男性轉女性推薦+12key,女性轉男性推薦-12key,如果音域爆炸導致音色失真也可以自己調整到合適音域。", "变调(整数, 半音数量, 升八度12降八度-12)": "變調(整數、半音數量、升八度12降八度-12)", "输入待处理音频文件路径(默认是正确格式示例)": "輸入待處理音頻檔案路徑(預設是正確格式示例)", -<<<<<<< HEAD "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU", "crepe_hop_length": "Crepe Hop Length (Only applies to crepe): Hop length refers to the time it takes for the speaker to jump to a dramatic pitch. Lower hop lengths take more time to infer but are more pitch accurate.", "特征检索库文件路径": "特徵檢索庫檔案路徑", "特征文件路径": "特徵檔案路徑", -======= - "选择音高提取算法,输入歌声可用pm提速,harvest低音好但巨慢无比,crepe效果好但吃GPU": "選擇音高提取演算法,輸入歌聲可用pm提速,harvest低音好但巨慢無比,crepe效果好但吃GPU", ->>>>>>> upstream/main ">=3则使用对harvest音高识别的结果使用中值滤波,数值为滤波半径,使用可以削弱哑音": ">=3則使用對harvest音高識別的結果使用中值濾波,數值為濾波半徑,使用可以削弱啞音", "特征检索库文件路径,为空则使用下拉的选择结果": "特徵檢索庫檔路徑,為空則使用下拉的選擇結果", "自动检测index路径,下拉式选择(dropdown)": "自動檢測index路徑,下拉式選擇(dropdown)", diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py index 740544b..dc940bd 100644 --- a/vc_infer_pipeline.py +++ b/vc_infer_pipeline.py @@ -125,19 +125,20 @@ class VC(object): x, f0_min, f0_max, - hop_length=160, model="full", ): + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 # Compute pitch using first gpu audio = torch.tensor(np.copy(x))[None].float() f0, pd = torchcrepe.predict( audio, self.sr, - hop_length, + self.window, f0_min, f0_max, model, - batch_size=hop_length * 2, + batch_size=batch_size, device=self.device, return_periodicity=True, ) @@ -197,9 +198,9 @@ class VC(object): f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" ) elif method == "crepe": - f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, crepe_hop_length) + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max) elif method == "crepe-tiny": - f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, crepe_hop_length, "tiny") + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny") elif method == 
"mangio-crepe": f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length) elif method == "mangio-crepe-tiny":