diff --git a/Makefile b/Makefile
index c029f4a..44de020 100644
--- a/Makefile
+++ b/Makefile
@@ -36,10 +36,18 @@ basev1: ## Download version 1 pre-trained models (Do only once after cloning the
basev2: ## Download version 2 pre-trained models (Do only once after cloning the fork)
mkdir -p pretrained_v2 uvr5_weights
git pull
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D32k.pth -d pretrained_v2 -o D32k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d pretrained_v2 -o D40k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D48k.pth -d pretrained_v2 -o D48k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G32k.pth -d pretrained_v2 -o G32k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d pretrained_v2 -o G40k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G48k.pth -d pretrained_v2 -o G48k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D32k.pth -d pretrained_v2 -o f0D32k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d pretrained_v2 -o f0D40k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D48k.pth -d pretrained_v2 -o f0D48k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G32k.pth -d pretrained_v2 -o f0G32k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d pretrained_v2 -o f0G40k.pth
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G48k.pth -d pretrained_v2 -o f0G48k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d uvr5_weights -o HP2-人声vocals+非人声instrumentals.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d uvr5_weights -o HP5-主旋律人声vocals+其他instrumentals.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d ./ -o hubert_base.pt
diff --git a/Retrieval_based_Voice_Conversion_WebUI.ipynb b/Retrieval_based_Voice_Conversion_WebUI.ipynb
index 956fed9..d1c95b2 100644
--- a/Retrieval_based_Voice_Conversion_WebUI.ipynb
+++ b/Retrieval_based_Voice_Conversion_WebUI.ipynb
@@ -45,7 +45,7 @@
"!apt-get -y install build-essential python3-dev ffmpeg\n",
"!pip3 install --upgrade setuptools wheel\n",
"!pip3 install --upgrade pip\n",
- "!pip3 install faiss-gpu fairseq gradio ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
],
"metadata": {
"id": "wjddIFr1oS3W"
@@ -378,4 +378,4 @@
"outputs": []
}
]
-}
\ No newline at end of file
+}
diff --git a/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb b/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb
index cf9b349..9fad92c 100644
--- a/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb
+++ b/Retrieval_based_Voice_Conversion_WebUI_v2.ipynb
@@ -34,7 +34,7 @@
"!apt-get -y install build-essential python3-dev ffmpeg\n",
"!pip3 install --upgrade setuptools wheel\n",
"!pip3 install --upgrade pip\n",
- "!pip3 install faiss-gpu fairseq gradio ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
+ "!pip3 install faiss-cpu==1.7.2 fairseq gradio==3.14.0 ffmpeg ffmpeg-python praat-parselmouth pyworld numpy==1.23.5 numba==0.56.4 librosa==0.9.2"
]
},
{
diff --git a/configs/32k_v2.json b/configs/32k_v2.json
new file mode 100644
index 0000000..70e534f
--- /dev/null
+++ b/configs/32k_v2.json
@@ -0,0 +1,46 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "epochs": 20000,
+ "learning_rate": 1e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 4,
+ "fp16_run": true,
+ "lr_decay": 0.999875,
+ "segment_size": 12800,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sampling_rate": 32000,
+ "filter_length": 1024,
+ "hop_length": 320,
+ "win_length": 1024,
+ "n_mel_channels": 80,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [10,8,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [20,16,4,4],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
diff --git a/configs/48k_v2.json b/configs/48k_v2.json
new file mode 100644
index 0000000..10f1ce7
--- /dev/null
+++ b/configs/48k_v2.json
@@ -0,0 +1,46 @@
+{
+ "train": {
+ "log_interval": 200,
+ "seed": 1234,
+ "epochs": 20000,
+ "learning_rate": 1e-4,
+ "betas": [0.8, 0.99],
+ "eps": 1e-9,
+ "batch_size": 4,
+ "fp16_run": true,
+ "lr_decay": 0.999875,
+ "segment_size": 11520,
+ "init_lr_ratio": 1,
+ "warmup_epochs": 0,
+ "c_mel": 45,
+ "c_kl": 1.0
+ },
+ "data": {
+ "max_wav_value": 32768.0,
+ "sampling_rate": 48000,
+ "filter_length": 2048,
+ "hop_length": 480,
+ "win_length": 2048,
+ "n_mel_channels": 128,
+ "mel_fmin": 0.0,
+ "mel_fmax": null
+ },
+ "model": {
+ "inter_channels": 192,
+ "hidden_channels": 192,
+ "filter_channels": 768,
+ "n_heads": 2,
+ "n_layers": 6,
+ "kernel_size": 3,
+ "p_dropout": 0,
+ "resblock": "1",
+ "resblock_kernel_sizes": [3,7,11],
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+ "upsample_rates": [12,10,2,2],
+ "upsample_initial_channel": 512,
+ "upsample_kernel_sizes": [24,20,4,4],
+ "use_spectral_norm": false,
+ "gin_channels": 256,
+ "spk_embed_dim": 109
+ }
+}
diff --git a/extract_feature_print.py b/extract_feature_print.py
index bfcf33d..cfc6e75 100644
--- a/extract_feature_print.py
+++ b/extract_feature_print.py
@@ -63,6 +63,13 @@ def readwave(wav_path, normalize=False):
# HuBERT model
printt("load model(s) from {}".format(model_path))
+# Check that the HuBERT model is present before trying to load it
+if not os.access(model_path, os.F_OK):
+    printt(
+        "Error: extraction aborted because %s does not exist; you can download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main"
+        % model_path
+    )
+    exit(0)
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[model_path],
suffix="",
diff --git a/gui.py b/gui.py
index b54ea24..bbda3c7 100644
--- a/gui.py
+++ b/gui.py
@@ -10,7 +10,7 @@
增加无索引支持
f0算法改harvest(怎么看就只有这个会影响CPU占用),但是不这么改效果不好
"""
-import os, sys, traceback
+import os, sys, traceback, re
import json
@@ -518,6 +518,7 @@ class GUI:
self.flag_vc = False
exit()
if event == "start_vc" and self.flag_vc == False:
-        self.set_values(values)
+        if self.set_values(values) == False:
+            continue
print("using_cuda:" + str(torch.cuda.is_available()))
self.start_vc()
@@ -556,6 +557,22 @@
return used_f0
def set_values(self, values):
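+        # Basic validation before starting conversion: both model paths must be
+        # set and no path may contain non-ASCII characters.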
+ if len(values["pth_path"].strip()) == 0:
+ sg.popup(i18n("请选择pth文件"))
+ return False
+ if len(values["index_path"].strip()) == 0:
+ sg.popup(i18n("请选择index文件"))
+ return False
+ pattern = re.compile("[^\x00-\x7F]+")
+ if pattern.findall(values["hubert_path"]):
+ sg.popup(i18n("hubert模型路径不可包含中文"))
+ return False
+ if pattern.findall(values["pth_path"]):
+ sg.popup(i18n("pth文件路径不可包含中文"))
+ return False
+ if pattern.findall(values["index_path"]):
+ sg.popup(i18n("index文件路径不可包含中文"))
+ return False
self.set_devices(values["sg_input_device"], values["sg_output_device"])
self.config.hubert_path = os.path.join(current_dir, "hubert_base.pt")
self.config.pth_path = values["pth_path"]
@@ -570,6 +587,7 @@
self.config.I_noise_reduce = values["I_noise_reduce"]
self.config.O_noise_reduce = values["O_noise_reduce"]
self.config.index_rate = values["index_rate"]
+ return True
def start_vc(self):
torch.cuda.empty_cache()
diff --git a/i18n/en_US.json b/i18n/en_US.json
index 4ea9351..1cb5f9e 100644
--- a/i18n/en_US.json
+++ b/i18n/en_US.json
@@ -46,7 +46,7 @@
"输入实验名": "Enter the experiment name:",
"目标采样率": "Target sample rate:",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Whether the model has pitch guidance (required for singing, optional for speech):",
- "版本(目前仅40k支持了v2)": "Version (currently only v2 is supported for 40k sample rate):",
+ "版本": "Version",
"提取音高和处理数据使用的CPU进程数": "Number of CPU processes used for pitch extraction and data processing:",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.",
"输入训练文件夹路径": "Enter the path of the training folder:",
@@ -122,7 +122,7 @@
"开始音频转换": "Start audio conversion",
"停止音频转换": "Stop audio conversion",
"推理时间(ms):": "Inference time (ms):",
- "伴奏人声分离&去混响&去回声": "Vocal/Accompaniment Separation & De-reverb & De-echo",
+ "人声伴奏分离批量处理, 使用UVR5模型。
合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。
模型分为三类:
1、保留人声:不带和声的音频选这个,对主人声保留比HP5更好。内置HP2和HP3两个模型,HP3可能轻微漏伴奏但对主人声保留比HP2稍微好一丁点;
2、仅保留主人声:带和声的音频选这个,对主人声可能有削弱。内置HP5一个模型;
3、去混响、去延迟模型(by FoxJoy):
(1)MDX-Net(onnx_dereverb):对于双通道混响是最好的选择,不能去除单通道混响;
(234)DeEcho:去除延迟效果。Aggressive比Normal去除得更彻底,DeReverb额外去除混响,可去除单声道混响,但是对高频重的板式混响去不干净。
去混响/去延迟,附:
1、DeEcho-DeReverb模型的耗时是另外2个DeEcho模型的接近2倍;
2、MDX-Net-Dereverb模型挺慢的;
3、个人推荐的最干净的配置是先MDX-Net再DeEcho-Aggressive。":"Batch processing for vocal accompaniment separation using the UVR5 model.
Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).
The model is divided into three categories:
1. Preserve vocals: Choose this option for audio without harmonies. It preserves vocals better than HP5. It includes two built-in models: HP2 and HP3. HP3 may slightly leak accompaniment but preserves vocals slightly better than HP2.
2. Preserve main vocals only: Choose this option for audio with harmonies. It may weaken the main vocals. It includes one built-in model: HP5.
3. De-reverb and de-delay models (by FoxJoy):
(1) MDX-Net: The best choice for stereo reverb removal but cannot remove mono reverb;
(234) DeEcho: Removes delay effects. Aggressive mode removes more thoroughly than Normal mode. DeReverb additionally removes reverb and can remove mono reverb, but not very effectively for heavily reverberated high-frequency content.
De-reverb/de-delay notes:
1. The processing time for the DeEcho-DeReverb model is approximately twice as long as the other two DeEcho models.
2. The MDX-Net-Dereverb model is quite slow.
3. The recommended cleanest configuration is to apply MDX-Net first and then DeEcho-Aggressive.",
"人声伴奏分离批量处理, 使用UVR5模型。": "Batch processing for vocal accompaniment separation using the UVR5 model.",
"合格的文件夹路径格式举例: E:\\codes\\py39\\vits_vc_gpu\\白鹭霜华测试样例(去文件管理器地址栏拷就行了)。": "Example of a valid folder path format: D:\\path\\to\\input\\folder (copy it from the file manager address bar).",
"模型分为三类:": "The model is divided into three categories:",
diff --git a/i18n/es_ES.json b/i18n/es_ES.json
index a27be6c..8785e83 100644
--- a/i18n/es_ES.json
+++ b/i18n/es_ES.json
@@ -46,7 +46,7 @@
"输入实验名": "Ingrese el nombre del modelo",
"目标采样率": "Tasa de muestreo objetivo",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "Si el modelo tiene guía de tono (necesaria para cantar, pero no para hablar)",
- "版本(目前仅40k支持了v2)": "版本(目前仅40k支持了v2)",
+ "版本": "版本",
"提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "paso 2a: recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. Actualmente, solo se admite la capacitación de una sola persona.",
"输入训练文件夹路径": "Introduzca la ruta de la carpeta de entrenamiento",
diff --git a/i18n/ja_JP.json b/i18n/ja_JP.json
index 2efb312..6470349 100644
--- a/i18n/ja_JP.json
+++ b/i18n/ja_JP.json
@@ -46,7 +46,7 @@
"输入实验名": "モデル名",
"目标采样率": "目標サンプリングレート",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "モデルに音高ガイドがあるかどうか(歌唱には必要ですが、音声には必要ありません)",
- "版本(目前仅40k支持了v2)": "バージョン(現在v2をサポートしているのは40kのみ)",
+ "版本": "バージョン",
"提取音高和处理数据使用的CPU进程数": "ピッチの抽出やデータ処理に使用するCPUスレッド数",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。",
"输入训练文件夹路径": "トレーニング用フォルダのパスを入力してください",
diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json
index 543f79a..d7fbe21 100644
--- a/i18n/zh_CN.json
+++ b/i18n/zh_CN.json
@@ -46,7 +46,7 @@
"输入实验名": "输入实验名",
"目标采样率": "目标采样率",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否带音高指导(唱歌一定要, 语音可以不要)",
- "版本(目前仅40k支持了v2)": "版本(目前仅40k支持了v2)",
+ "版本": "版本",
"提取音高和处理数据使用的CPU进程数": "提取音高和处理数据使用的CPU进程数",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ",
"输入训练文件夹路径": "输入训练文件夹路径",
diff --git a/i18n/zh_HK.json b/i18n/zh_HK.json
index dc37397..faa1085 100644
--- a/i18n/zh_HK.json
+++ b/i18n/zh_HK.json
@@ -36,7 +36,7 @@
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
"导出文件格式": "導出檔格式",
- "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"模型": "模型",
"指定输出主人声文件夹": "指定输出主人声文件夹",
@@ -46,7 +46,7 @@
"输入实验名": "輸入實驗名稱",
"目标采样率": "目標取樣率",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
- "版本(目前仅40k支持了v2)": "版本(目前僅40k支持了v2)",
+ "版本": "版本",
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
diff --git a/i18n/zh_SG.json b/i18n/zh_SG.json
index 2ee7bf6..385bc35 100644
--- a/i18n/zh_SG.json
+++ b/i18n/zh_SG.json
@@ -36,7 +36,7 @@
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
"导出文件格式": "導出檔格式",
- "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"模型": "模型",
"指定输出主人声文件夹": "指定输出主人声文件夹",
@@ -46,7 +46,7 @@
"输入实验名": "輸入實驗名稱",
"目标采样率": "目標取樣率",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
- "版本(目前仅40k支持了v2)": "版本(目前僅40k支持了v2)",
+ "版本": "版本",
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
diff --git a/i18n/zh_TW.json b/i18n/zh_TW.json
index b9b6283..b243182 100644
--- a/i18n/zh_TW.json
+++ b/i18n/zh_TW.json
@@ -36,7 +36,7 @@
"输入待处理音频文件夹路径(去文件管理器地址栏拷就行了)": "輸入待處理音頻資料夾路徑(去檔案管理器地址欄拷貝即可)",
"也可批量输入音频文件, 二选一, 优先读文件夹": "也可批量輸入音頻檔案,二選一,優先讀資料夾",
"导出文件格式": "導出檔格式",
- "伴奏人声分离&去混响&去回声": "伴奏人声分离&去混响&去回声",
+ "伴奏人声分离&去混响&去回声": "伴奏人聲分離&去混響&去回聲",
"输入待处理音频文件夹路径": "輸入待處理音頻資料夾路徑",
"模型": "模型",
"指定输出主人声文件夹": "指定输出主人声文件夹",
@@ -46,7 +46,7 @@
"输入实验名": "輸入實驗名稱",
"目标采样率": "目標取樣率",
"模型是否带音高指导(唱歌一定要, 语音可以不要)": "模型是否帶音高指導(唱歌一定要,語音可以不要)",
- "版本(目前仅40k支持了v2)": "版本(目前僅40k支持了v2)",
+ "版本": "版本",
"提取音高和处理数据使用的CPU进程数": "提取音高和處理數據使用的CPU進程數",
"step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。",
"输入训练文件夹路径": "輸入訓練檔案夾路徑",
diff --git a/infer-web.py b/infer-web.py
index c7de95b..60f981b 100644
--- a/infer-web.py
+++ b/infer-web.py
@@ -1,15 +1,48 @@
-import torch, os, traceback, sys, warnings, shutil, numpy as np
+import os
+import shutil
+import sys
-os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
-import threading
-from time import sleep
-from subprocess import Popen
-import faiss
-from random import shuffle
-import json
+import json  # the Mangio fork uses json for preset saving
now_dir = os.getcwd()
sys.path.append(now_dir)
+import traceback, pdb
+import warnings
+
+import numpy as np
+import torch
+
+os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1"
+import logging
+import threading
+from random import shuffle
+from subprocess import Popen
+from time import sleep
+
+import faiss
+import ffmpeg
+import gradio as gr
+import soundfile as sf
+from config import Config
+from fairseq import checkpoint_utils
+from i18n import I18nAuto
+from infer_pack.models import (
+ SynthesizerTrnMs256NSFsid,
+ SynthesizerTrnMs256NSFsid_nono,
+ SynthesizerTrnMs768NSFsid,
+ SynthesizerTrnMs768NSFsid_nono,
+)
+from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
+from infer_uvr5 import _audio_pre_, _audio_pre_new
+from MDXNet import MDXNetDereverb
+from my_utils import load_audio
+from train.process_ckpt import change_info, extract_small_model, merge, show_info
+from vc_infer_pipeline import VC
+from sklearn.cluster import MiniBatchKMeans
+
+logging.getLogger("numba").setLevel(logging.WARNING)
+
+
tmp = os.path.join(now_dir, "TEMP")
shutil.rmtree(tmp, ignore_errors=True)
shutil.rmtree("%s/runtime/Lib/site-packages/infer_pack" % (now_dir), ignore_errors=True)
@@ -20,41 +53,44 @@ os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True)
os.environ["TEMP"] = tmp
warnings.filterwarnings("ignore")
torch.manual_seed(114514)
-from i18n import I18nAuto
-import ffmpeg
-from MDXNet import MDXNetDereverb
+
+config = Config()
i18n = I18nAuto()
i18n.print()
# 判断是否有能用来训练和加速推理的N卡
ngpu = torch.cuda.device_count()
gpu_infos = []
mem = []
-if (not torch.cuda.is_available()) or ngpu == 0:
- if_gpu_ok = False
-else:
- if_gpu_ok = False
+if_gpu_ok = False
+
+if torch.cuda.is_available() or ngpu != 0:
for i in range(ngpu):
gpu_name = torch.cuda.get_device_name(i)
- if (
- "10" in gpu_name
- or "16" in gpu_name
- or "20" in gpu_name
- or "30" in gpu_name
- or "40" in gpu_name
- or "A2" in gpu_name.upper()
- or "A3" in gpu_name.upper()
- or "A4" in gpu_name.upper()
- or "P4" in gpu_name.upper()
- or "A50" in gpu_name.upper()
- or "A60" in gpu_name.upper()
- or "70" in gpu_name
- or "80" in gpu_name
- or "90" in gpu_name
- or "M4" in gpu_name.upper()
- or "T4" in gpu_name.upper()
- or "TITAN" in gpu_name.upper()
- ): # A10#A100#V100#A40#P40#M40#K80#A4500
+ if any(
+ value in gpu_name.upper()
+ for value in [
+ "10",
+ "16",
+ "20",
+ "30",
+ "40",
+ "A2",
+ "A3",
+ "A4",
+ "P4",
+ "A50",
+ "500",
+ "A60",
+ "70",
+ "80",
+ "90",
+ "M4",
+ "T4",
+ "TITAN",
+ ]
+ ):
+ # A10#A100#V100#A40#P40#M40#K80#A4500
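+        # Substring match against NVIDIA series names that are considered usable
+        # for training and accelerated inference.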
if_gpu_ok = True # 至少有一张能用的N卡
gpu_infos.append("%s\t%s" % (i, gpu_name))
mem.append(
@@ -66,32 +102,13 @@ else:
+ 0.4
)
)
-if if_gpu_ok == True and len(gpu_infos) > 0:
+if if_gpu_ok and len(gpu_infos) > 0:
gpu_info = "\n".join(gpu_infos)
default_batch_size = min(mem) // 2
else:
gpu_info = i18n("很遗憾您这没有能用的显卡来支持您训练")
default_batch_size = 1
gpus = "-".join([i[0] for i in gpu_infos])
-from infer_pack.models import (
- SynthesizerTrnMs256NSFsid,
- SynthesizerTrnMs256NSFsid_nono,
- SynthesizerTrnMs768NSFsid,
- SynthesizerTrnMs768NSFsid_nono,
-)
-import soundfile as sf
-from fairseq import checkpoint_utils
-import gradio as gr
-import logging
-from vc_infer_pipeline import VC
-from config import Config
-from infer_uvr5 import _audio_pre_, _audio_pre_new
-from my_utils import load_audio
-from train.process_ckpt import show_info, change_info, merge, extract_small_model
-
-config = Config()
-# from trainset_preprocess_pipeline import PreProcess
-logging.getLogger("numba").setLevel(logging.WARNING)
class ToolButton(gr.Button, gr.components.FormComponent):
@@ -166,7 +183,7 @@ def vc_single(
if audio_max > 1:
audio /= audio_max
times = [0, 0, 0]
- if hubert_model == None:
+ if not hubert_model:
load_hubert()
if_f0 = cpt.get("f0", 1)
file_index = (
@@ -206,7 +223,7 @@ def vc_single(
crepe_hop_length,
f0_file=f0_file,
)
- if resample_sr >= 16000 and tgt_sr != resample_sr:
+ if tgt_sr != resample_sr >= 16000:
tgt_sr = resample_sr
index_info = (
"Using index:%s." % file_index
@@ -386,11 +403,11 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
# 一个选项卡全局只能有一个音色
-def get_vc(sid):
+def get_vc(sid, to_return_protect0, to_return_protect1):
global n_spk, tgt_sr, net_g, vc, cpt, version
if sid == "" or sid == []:
global hubert_model
- if hubert_model != None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
+ if hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的
print("clean_empty_cache")
del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt
hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
@@ -424,6 +441,23 @@ def get_vc(sid):
tgt_sr = cpt["config"][-1]
cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
if_f0 = cpt.get("f0", 1)
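+    # Models without pitch guidance (f0 == 0) do not use the protection sliders,
+    # so hide them; otherwise keep the values passed in and make them visible.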
+ if if_f0 == 0:
+ to_return_protect0 = to_return_protect1 = {
+ "visible": False,
+ "value": 0.5,
+ "__type__": "update",
+ }
+ else:
+ to_return_protect0 = {
+ "visible": True,
+ "value": to_return_protect0,
+ "__type__": "update",
+ }
+ to_return_protect1 = {
+ "visible": True,
+ "value": to_return_protect1,
+ "__type__": "update",
+ }
version = cpt.get("version", "v1")
if version == "v1":
if if_f0 == 1:
@@ -444,7 +478,11 @@ def get_vc(sid):
net_g = net_g.float()
vc = VC(tgt_sr, config)
n_spk = cpt["config"][-3]
- return {"visible": True, "maximum": n_spk, "__type__": "update"}
+ return (
+ {"visible": True, "maximum": n_spk, "__type__": "update"},
+ to_return_protect0,
+ to_return_protect1,
+ )
def change_choices():
@@ -476,7 +514,7 @@ sr_dict = {
def if_done(done, p):
while 1:
- if p.poll() == None:
+ if p.poll() is None:
sleep(0.5)
else:
break
@@ -489,7 +527,7 @@ def if_done_multi(done, ps):
# 只要有一个进程未结束都不停
flag = 1
for p in ps:
- if p.poll() == None:
+ if p.poll() is None:
flag = 0
sleep(0.5)
break
@@ -524,7 +562,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p):
with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
yield (f.read())
sleep(1)
- if done[0] == True:
+ if done[0]:
break
with open("%s/logs/%s/preprocess.log" % (now_dir, exp_dir), "r") as f:
log = f.read()
@@ -563,7 +601,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, echl):
) as f:
yield (f.read())
sleep(1)
- if done[0] == True:
+ if done[0]:
break
with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
log = f.read()
@@ -611,7 +649,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, echl):
with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
yield (f.read())
sleep(1)
- if done[0] == True:
+ if done[0]:
break
with open("%s/logs/%s/extract_f0_feature.log" % (now_dir, exp_dir), "r") as f:
log = f.read()
@@ -622,51 +660,106 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, echl):
def change_sr2(sr2, if_f0_3, version19):
path_str = "" if version19 == "v1" else "_v2"
f0_str = "f0" if if_f0_3 else ""
- if_pretrained_generator_exist = os.access("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK)
- if_pretrained_discriminator_exist = os.access("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK)
- if (if_pretrained_generator_exist == False):
- print("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model")
- if (if_pretrained_discriminator_exist == False):
- print("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model")
- return (
- ("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)) if if_pretrained_generator_exist else "",
- ("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)) if if_pretrained_discriminator_exist else "",
- {"visible": True, "__type__": "update"}
+ if_pretrained_generator_exist = os.access(
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
)
+ if_pretrained_discriminator_exist = os.access(
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+ )
+    if not if_pretrained_generator_exist:
+ print(
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
+ "not exist, will not use pretrained model",
+ )
+    if not if_pretrained_discriminator_exist:
+ print(
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
+ "not exist, will not use pretrained model",
+ )
+ return (
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_generator_exist
+ else "",
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_discriminator_exist
+ else "",
+ {"visible": True, "__type__": "update"},
+ )
+
def change_version19(sr2, if_f0_3, version19):
path_str = "" if version19 == "v1" else "_v2"
+ if sr2 == "32k" and version19 == "v1":
+ sr2 = "40k"
+ to_return_sr2 = (
+ {"choices": ["40k", "48k"], "__type__": "update"}
+ if version19 == "v1"
+ else {"choices": ["32k", "40k", "48k"], "__type__": "update"}
+ )
f0_str = "f0" if if_f0_3 else ""
- if_pretrained_generator_exist = os.access("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK)
- if_pretrained_discriminator_exist = os.access("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK)
- if (if_pretrained_generator_exist == False):
- print("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model")
- if (if_pretrained_discriminator_exist == False):
- print("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model")
+ if_pretrained_generator_exist = os.access(
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK
+ )
+ if_pretrained_discriminator_exist = os.access(
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK
+ )
+ if not if_pretrained_generator_exist:
+ print(
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2),
+ "not exist, will not use pretrained model",
+ )
+ if not if_pretrained_discriminator_exist:
+ print(
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2),
+ "not exist, will not use pretrained model",
+ )
return (
- ("pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)) if if_pretrained_generator_exist else "",
- ("pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)) if if_pretrained_discriminator_exist else "",
+ "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_generator_exist
+ else "",
+ "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2)
+ if if_pretrained_discriminator_exist
+ else "",
+ to_return_sr2,
)
def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15
path_str = "" if version19 == "v1" else "_v2"
- if_pretrained_generator_exist = os.access("pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK)
- if_pretrained_discriminator_exist = os.access("pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK)
- if (if_pretrained_generator_exist == False):
- print("pretrained%s/f0G%s.pth" % (path_str, sr2), "not exist, will not use pretrained model")
- if (if_pretrained_discriminator_exist == False):
- print("pretrained%s/f0D%s.pth" % (path_str, sr2), "not exist, will not use pretrained model")
+ if_pretrained_generator_exist = os.access(
+ "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK
+ )
+ if_pretrained_discriminator_exist = os.access(
+ "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK
+ )
+ if not if_pretrained_generator_exist:
+ print(
+ "pretrained%s/f0G%s.pth" % (path_str, sr2),
+ "not exist, will not use pretrained model",
+ )
+ if not if_pretrained_discriminator_exist:
+ print(
+ "pretrained%s/f0D%s.pth" % (path_str, sr2),
+ "not exist, will not use pretrained model",
+ )
if if_f0_3:
return (
{"visible": True, "__type__": "update"},
- "pretrained%s/f0G%s.pth" % (path_str, sr2) if if_pretrained_generator_exist else "",
- "pretrained%s/f0D%s.pth" % (path_str, sr2) if if_pretrained_discriminator_exist else "",
+ "pretrained%s/f0G%s.pth" % (path_str, sr2)
+ if if_pretrained_generator_exist
+ else "",
+ "pretrained%s/f0D%s.pth" % (path_str, sr2)
+ if if_pretrained_discriminator_exist
+ else "",
)
return (
{"visible": False, "__type__": "update"},
- ("pretrained%s/G%s.pth" % (path_str, sr2)) if if_pretrained_generator_exist else "",
- ("pretrained%s/D%s.pth" % (path_str, sr2)) if if_pretrained_discriminator_exist else "",
+ ("pretrained%s/G%s.pth" % (path_str, sr2))
+ if if_pretrained_generator_exist
+ else "",
+ ("pretrained%s/D%s.pth" % (path_str, sr2))
+ if if_pretrained_discriminator_exist
+ else "",
)
@@ -773,8 +866,8 @@ def click_train(
gpus16,
total_epoch11,
save_epoch10,
- ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "",
- ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "",
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1 if if_save_latest13 == i18n("是") else 0,
1 if if_cache_gpu17 == i18n("是") else 0,
1 if if_save_every_weights18 == i18n("是") else 0,
@@ -792,8 +885,8 @@ def click_train(
batch_size12,
total_epoch11,
save_epoch10,
- ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "\b",
- ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "\b",
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "\b",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "\b",
1 if if_save_latest13 == i18n("是") else 0,
1 if if_cache_gpu17 == i18n("是") else 0,
1 if if_save_every_weights18 == i18n("是") else 0,
@@ -815,11 +908,12 @@ def train_index(exp_dir1, version19):
if version19 == "v1"
else "%s/3_feature768" % (exp_dir)
)
- if os.path.exists(feature_dir) == False:
+ if not os.path.exists(feature_dir):
return "请先进行特征提取!"
listdir_res = list(os.listdir(feature_dir))
if len(listdir_res) == 0:
return "请先进行特征提取!"
+ infos = []
npys = []
for name in sorted(listdir_res):
phone = np.load("%s/%s" % (feature_dir, name))
@@ -828,10 +922,30 @@ def train_index(exp_dir1, version19):
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
+ if big_npy.shape[0] > 2e5:
+ # if(1):
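+        # With more than ~200k feature vectors, shrink them to 10k k-means
+        # centroids first so the faiss IVF index stays small and fast to build.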
+        infos.append("Trying k-means on %s feature vectors to reduce them to 10k centers." % big_npy.shape[0])
+ yield "\n".join(infos)
+ try:
+ big_npy = (
+ MiniBatchKMeans(
+ n_clusters=10000,
+ verbose=True,
+ batch_size=256 * config.n_cpu,
+ compute_labels=False,
+ init="random",
+ )
+ .fit(big_npy)
+ .cluster_centers_
+ )
+ except:
+ info = traceback.format_exc()
+ print(info)
+ infos.append(info)
+ yield "\n".join(infos)
+
np.save("%s/total_fea.npy" % exp_dir, big_npy)
- # n_ivf = big_npy.shape[0] // 39
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
- infos = []
infos.append("%s,%s" % (big_npy.shape, n_ivf))
yield "\n".join(infos)
index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf)
@@ -1022,7 +1136,7 @@ def train1key(
if gpus16:
cmd = (
config.python_cmd
- +" train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
+ + " train_nsf_sim_cache_sid_load_pretrain.py -e %s -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s"
% (
exp_dir1,
sr2,
@@ -1031,8 +1145,8 @@ def train1key(
gpus16,
total_epoch11,
save_epoch10,
- ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "",
- ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "",
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1 if if_save_latest13 == i18n("是") else 0,
1 if if_cache_gpu17 == i18n("是") else 0,
1 if if_save_every_weights18 == i18n("是") else 0,
@@ -1050,8 +1164,8 @@ def train1key(
batch_size12,
total_epoch11,
save_epoch10,
- ("-pg %s" % pretrained_G14) if pretrained_G14 != "" else "",
- ("-pd %s" % pretrained_D15) if pretrained_D15 != "" else "",
+ "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "",
+ "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "",
1 if if_save_latest13 == i18n("是") else 0,
1 if if_cache_gpu17 == i18n("是") else 0,
1 if if_save_every_weights18 == i18n("是") else 0,
@@ -1073,6 +1187,29 @@ def train1key(
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
+
+ if big_npy.shape[0] > 2e5:
+ # if(1):
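+        # Same guard as in train_index: reduce very large feature sets to 10k
+        # k-means centroids before building the index.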
+        info = "Trying k-means on %s feature vectors to reduce them to 10k centers." % big_npy.shape[0]
+ print(info)
+ yield get_info_str(info)
+ try:
+ big_npy = (
+ MiniBatchKMeans(
+ n_clusters=10000,
+ verbose=True,
+ batch_size=256 * config.n_cpu,
+ compute_labels=False,
+ init="random",
+ )
+ .fit(big_npy)
+ .cluster_centers_
+ )
+ except:
+ info = traceback.format_exc()
+ print(info)
+ yield get_info_str(info)
+
np.save("%s/total_fea.npy" % model_log_dir, big_npy)
# n_ivf = big_npy.shape[0] // 39
@@ -1106,10 +1243,7 @@ def train1key(
# ckpt_path2.change(change_info_,[ckpt_path2],[sr__,if_f0__])
def change_info_(ckpt_path):
- if (
- os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log"))
- == False
- ):
+ if not os.path.exists(ckpt_path.replace(os.path.basename(ckpt_path), "train.log")):
return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
try:
with open(
@@ -1124,15 +1258,12 @@ def change_info_(ckpt_path):
return {"__type__": "update"}, {"__type__": "update"}, {"__type__": "update"}
-from infer_pack.models_onnx import SynthesizerTrnMsNSFsidM
-
-
-def export_onnx(ModelPath, ExportedPath, MoeVS=True):
+def export_onnx(ModelPath, ExportedPath):
cpt = torch.load(ModelPath, map_location="cpu")
- cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
- hidden_channels = 256 if cpt.get("version","v1")=="v1"else 768#cpt["config"][-2] # hidden_channels,为768Vec做准备
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+ vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768
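+    # v1 checkpoints use 256-dim content features; v2 uses the full 768-dim output.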
- test_phone = torch.rand(1, 200, hidden_channels) # hidden unit
+ test_phone = torch.rand(1, 200, vec_channels) # hidden unit
test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用)
test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹)
test_pitchf = torch.rand(1, 200) # nsf基频
@@ -1143,7 +1274,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
net_g = SynthesizerTrnMsNSFsidM(
- *cpt["config"], is_half=False,version=cpt.get("version","v1")
+ *cpt["config"], is_half=False, version=cpt.get("version", "v1")
) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16)
net_g.load_state_dict(cpt["weight"], strict=False)
input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"]
@@ -1169,7 +1300,7 @@ def export_onnx(ModelPath, ExportedPath, MoeVS=True):
"rnd": [2],
},
do_constant_folding=False,
- opset_version=16,
+ opset_version=13,
verbose=False,
input_names=input_names,
output_names=output_names,
@@ -1519,11 +1650,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
interactive=True,
)
clean_button.click(fn=clean, inputs=[], outputs=[sid0])
- sid0.change(
- fn=get_vc,
- inputs=[sid0],
- outputs=[spk_item],
- )
with gr.Group():
gr.Markdown(
value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ")
@@ -1757,6 +1883,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
],
[vc_output3],
)
+ sid0.change(
+ fn=get_vc,
+ inputs=[sid0, protect0, protect1],
+ outputs=[spk_item, protect0, protect1],
+ )
with gr.TabItem(i18n("伴奏人声分离&去混响&去回声")):
with gr.Group():
gr.Markdown(
@@ -1843,7 +1974,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
interactive=True,
)
version19 = gr.Radio(
- label=i18n("版本(目前仅40k支持了v2)"),
+ label=i18n("版本"),
choices=["v1", "v2"],
value="v1",
interactive=True,
@@ -1854,7 +1985,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
maximum=config.n_cpu,
step=1,
label=i18n("提取音高和处理数据使用的CPU进程数"),
- value=config.n_cpu,
+ value=int(np.ceil(config.n_cpu / 1.5)),
interactive=True,
)
with gr.Group(): # 暂时单人的, 后面支持最多4人的#数据处理
@@ -1980,7 +2111,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
version19.change(
change_version19,
[sr2, if_f0_3, version19],
- [pretrained_G14, pretrained_D15],
+ [pretrained_G14, pretrained_D15, sr2],
)
if_f0_3.change(
change_f0,
@@ -2058,7 +2189,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
with gr.Row():
sr_ = gr.Radio(
label=i18n("目标采样率"),
- choices=["32k", "40k", "48k"],
+ choices=["40k", "48k"],
value="40k",
interactive=True,
)
@@ -2174,7 +2305,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
[ckpt_path2, save_name, sr__, if_f0__, info___, version_1],
info7,
)
-
+
with gr.TabItem(i18n("Onnx导出")):
with gr.Row():
ckpt_dir = gr.Textbox(label=i18n("RVC模型路径"), value="", interactive=True)
@@ -2183,11 +2314,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
label=i18n("Onnx输出路径"), value="", interactive=True
)
with gr.Row():
- moevs = gr.Checkbox(label=i18n("MoeVS模型"), value=False,visible=False)
infoOnnx = gr.Label(label="info")
with gr.Row():
butOnnx = gr.Button(i18n("导出Onnx模型"), variant="primary")
- butOnnx.click(export_onnx, [ckpt_dir, onnx_dir, moevs], infoOnnx)
+ butOnnx.click(export_onnx, [ckpt_dir, onnx_dir], infoOnnx)
tab_faq = i18n("常见问题解答")
with gr.TabItem(tab_faq):
diff --git a/infer_pack/onnx_inference.py b/infer_pack/onnx_inference.py
index 09a4ed2..fb583a4 100644
--- a/infer_pack/onnx_inference.py
+++ b/infer_pack/onnx_inference.py
@@ -11,6 +11,8 @@ class ContentVec:
providers = ["CPUExecutionProvider"]
elif device == "cuda":
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+ elif device == "dml":
+ providers = ["DmlExecutionProvider"]
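+            # DmlExecutionProvider is DirectML (typically from the onnxruntime-directml build).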
else:
raise RuntimeError("Unsportted Device")
self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
@@ -68,6 +70,8 @@ class OnnxRVC:
providers = ["CPUExecutionProvider"]
elif device == "cuda":
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+ elif device == "dml":
+ providers = ["DmlExecutionProvider"]
else:
raise RuntimeError("Unsportted Device")
self.model = onnxruntime.InferenceSession(model_path, providers=providers)
diff --git a/requirements.txt b/requirements.txt
index c533c38..43d4124 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ librosa==0.9.1
llvmlite==0.39.0
fairseq==0.12.2
faiss-cpu==1.7.3
-gradio==3.28.1
+gradio==3.14.0
Cython
pydub>=0.25.1
soundfile>=0.12.1
diff --git a/train/utils.py b/train/utils.py
index 434ed7a..984c8c1 100644
--- a/train/utils.py
+++ b/train/utils.py
@@ -360,7 +360,10 @@ def get_hparams(init=True):
if not os.path.exists(experiment_dir):
os.makedirs(experiment_dir)
- config_path = "configs/%s.json" % args.sample_rate
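+    # v2 models read the new configs/*_v2.json files; 40k reuses the v1 config
+    # since no separate 40k_v2.json is shipped.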
+    if args.version == "v1" or args.sample_rate == "40k":
+ config_path = "configs/%s.json" % args.sample_rate
+ else:
+ config_path = "configs/%s_v2.json" % args.sample_rate
config_save_path = os.path.join(experiment_dir, "config.json")
if init:
with open(config_path, "r") as f:
diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/train_nsf_sim_cache_sid_load_pretrain.py
index 0d36fe6..2949bc4 100644
--- a/train_nsf_sim_cache_sid_load_pretrain.py
+++ b/train_nsf_sim_cache_sid_load_pretrain.py
@@ -192,7 +192,6 @@ def run(rank, n_gpus, hps):
epoch_str = 1
global_step = 0
if hps.pretrainG != "":
-
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainG))
print(
@@ -201,7 +200,6 @@ def run(rank, n_gpus, hps):
)
) ##测试不加载优化器
if hps.pretrainD != "":
-
if rank == 0:
logger.info("loaded pretrained %s" % (hps.pretrainD))
print(
diff --git a/trainset_preprocess_pipeline_print.py b/trainset_preprocess_pipeline_print.py
index 7d78dec..20a629d 100644
--- a/trainset_preprocess_pipeline_print.py
+++ b/trainset_preprocess_pipeline_print.py
@@ -54,7 +54,11 @@ class PreProcess:
os.makedirs(self.wavs16k_dir, exist_ok=True)
def norm_write(self, tmp_audio, idx0, idx1):
- tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (self.max * self.alpha)) + (
+ tmp_max = np.abs(tmp_audio).max()
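+        # Slices whose absolute peak exceeds 2.5 are treated as corrupted audio
+        # and skipped rather than normalized.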
+ if tmp_max > 2.5:
+ print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
+ return
+ tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
1 - self.alpha
) * tmp_audio
wavfile.write(
diff --git a/vc_infer_pipeline.py b/vc_infer_pipeline.py
index d97f6fd..290b8f8 100644
--- a/vc_infer_pipeline.py
+++ b/vc_infer_pipeline.py
@@ -371,7 +371,7 @@ class VC(object):
with torch.no_grad():
logits = model.extract_features(**inputs)
feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
- if protect < 0.5:
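+        # Feature protection only applies to f0 models, so also require a
+        # pitch/pitchf track before keeping a copy of the raw features for blending.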
+        if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = feats.clone()
if (
isinstance(index, type(None)) == False
@@ -398,7 +398,7 @@ class VC(object):
)
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
- if protect < 0.5:
+        if protect < 0.5 and pitch is not None and pitchf is not None:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
0, 2, 1
)
@@ -410,7 +410,7 @@ class VC(object):
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
- if protect < 0.5:
+        if protect < 0.5 and pitch is not None and pitchf is not None:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect