Added CLI functionality for inference! No need to use the GUI for inference. Added an audio-outputs directory

This commit is contained in:
Mangio621
2023-05-11 03:32:09 +10:00
parent 0b3e29122e
commit a3ac337f6b
4 changed files with 210 additions and 9 deletions

README.md

@@ -52,6 +52,7 @@ Special thanks to discord user @kalomaze#2983 for creating a temporary colab not
 + Added CLI functionality
 + added train-index-cli.py to train the feature index without the GUI
 + added extract-small-model-cli.py to extract the small model without the GUI
++ added infer-cli.py to do inference without the GUI.
 ## This repository has the following features too:
 + Reduce tone leakage by replacing source feature to training-set feature using top1 retrieval;
@@ -193,6 +194,22 @@ python train-index-cli.py mi-test
 python extract-small-model-cli.py logs/G_99750.pth MyModel 40k 1 "This is a cool model."
 ```
+## Inference without the GUI (Voice Conversion)
+```bash
+# + Mangio-RVC-Fork Feature. Infer audio with just the CLI
+# Arguments
+# arg1 = model name in the weights folder (mi-test.pth)
+# arg2 = source file path (.wav)
+# arg3 = output file name, placed in ./audio-outputs (myoutput.wav)
+# arg4 = feature index file path (E:\added_IVF3042_Flat_nprobe_1.index)
+# arg5 = speaker ID (0)
+# arg6 = transposition (12 = 12 semitones up)
+# arg7 = f0 method (harvest, pm, crepe, dio, crepe-tiny)
+# arg8 = crepe hop length; use 128 (applies to the crepe f0 methods only)
+# arg9 = feature index ratio (0.78)
+python infer-cli.py mi-test.pth E:\my-source-file.wav conversion_output.wav E:\added_IVF3042_Flat_nprobe_1.index 0 -2 pm 128 0.78
+```
 # Running the Tensorboard 📉
 ```bash
 cd Mangio-RVC-Fork
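
For scripted use, the same nine positional arguments can be driven from Python through the standard subprocess module. A minimal sketch, assuming placeholder model, audio, and index paths and that it is run from the repository root:

```python
# Minimal sketch: drive infer-cli.py from a Python script.
# The model, audio, and index paths below are placeholders, not files shipped with the fork.
import subprocess

subprocess.run(
    [
        "python", "infer-cli.py",
        "mi-test.pth",                         # arg1: model in the weights folder
        "my-source-file.wav",                  # arg2: source audio path
        "conversion_output.wav",               # arg3: written to ./audio-outputs
        "added_IVF3042_Flat_nprobe_1.index",   # arg4: feature index path
        "0",                                   # arg5: speaker ID
        "-2",                                  # arg6: transposition in semitones
        "pm",                                  # arg7: f0 method
        "128",                                 # arg8: crepe hop length (crepe only)
        "0.78",                                # arg9: feature index ratio
    ],
    check=True,  # raise CalledProcessError if the CLI exits non-zero
)
```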

config.py

@@ -4,20 +4,21 @@ from multiprocessing import cpu_count
 class Config:
-    def __init__(self):
+    def __init__(self, is_gui=True):
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
         self.gpu_name = None
         self.gpu_mem = None
-        (
-            self.python_cmd,
-            self.listen_port,
-            self.iscolab,
-            self.noparallel,
-            self.noautoopen,
-            self.paperspace,
-        ) = self.arg_parse()
+        if is_gui:
+            (
+                self.python_cmd,
+                self.listen_port,
+                self.iscolab,
+                self.noparallel,
+                self.noautoopen,
+                self.paperspace,
+            ) = self.arg_parse()
         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
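
The new is_gui flag lets command-line scripts construct Config without running the Gradio-oriented argument parser, which would otherwise consume sys.argv. A minimal sketch of the intended use, assuming it runs from the repository root where config.py lives:

```python
# Minimal sketch: skip arg_parse() so sys.argv stays free for the
# CLI script's own positional arguments (this is what infer-cli.py does).
from config import Config

config = Config(is_gui=False)  # python_cmd, listen_port, etc. are simply not set
print(config.device, config.is_half)  # device_config() still runs unconditionally
```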

extract-small-model-cli.py

@@ -1,3 +1,5 @@
+# Fork Feature Mangio RVC Fork. Extract the small model from a checkpoint with CLI.
 import sys
 from train.process_ckpt import extract_small_model

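Only the imports of the new CLI script are shown in this hunk. Based on the README invocation above, here is a hedged sketch of the forwarding it presumably performs; the parameter order of extract_small_model is inferred from that example, not confirmed by this diff:

```python
# Hedged sketch, not the committed file: forward the five CLI arguments
# from the README example to train.process_ckpt.extract_small_model.
import sys
from train.process_ckpt import extract_small_model

ckpt_path = str(sys.argv[1])    # e.g. logs/G_99750.pth
save_name = str(sys.argv[2])    # e.g. MyModel
sample_rate = str(sys.argv[3])  # e.g. 40k
if_f0 = str(sys.argv[4])        # 1 = trained with pitch guidance (assumed)
info = str(sys.argv[5])         # free-form model notes
print(extract_small_model(ckpt_path, save_name, sample_rate, if_f0, info))
```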
infer-cli.py (new file, 181 lines)

@@ -0,0 +1,181 @@
# Fork Feature Mangio RVC Fork. Infer audio with just the CLI
import torch, os, traceback, sys, warnings, shutil, numpy as np
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from fairseq import checkpoint_utils
from vc_infer_pipeline import VC
from config import Config
from my_utils import load_audio

# Fork Feature. Write an audio file
from scipy.io.wavfile import write

config = Config(is_gui=False)
weight_root = 'weights'

n_spk = None  # Set from get_vc
tgt_sr = 0  # Set from get_vc
net_g = None  # Set from get_vc
vc = None  # Set from get_vc
cpt = None  # Set from get_vc
hubert_model = None  # Set from vc_single

def get_hubert():
    # Load the base HuBERT feature extractor onto the configured device
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()
    return hubert_model

def get_vc(sid):
    global n_spk, tgt_sr, net_g, vc, cpt
    if sid == []:  # Inherited from the GUI flow: unload the current model when no sid is selected
        global hubert_model
        if hubert_model is not None:  # polling may have switched sid from a loaded model to none, so check for that
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            ### Without the juggling below, the cache is never fully cleared
            if_f0 = cpt.get("f0", 1)
            if if_f0 == 1:
                net_g = SynthesizerTrnMs256NSFsid(
                    *cpt["config"], is_half=config.is_half
                )
            else:
                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cpt = None
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))  # Without this line the state never clears fully; really odd
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    print("Mangio-RVC-Fork Infer-CLI: Model has been loaded...")
    return {"visible": True, "maximum": n_spk, "__type__": "update"}

def vc_single(
    sid,
    input_audio,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    # file_big_npy,
    index_rate,
    crepe_hop_length,
):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
    global tgt_sr, net_g, vc, hubert_model, cpt
    if input_audio is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio, 16000)
        times = [0, 0, 0]
        if hubert_model is None:
            hubert_model = get_hubert()
        if_f0 = cpt.get("f0", 1)
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )  # Guard against a common user typo by automatically replacing "trained" with "added"
        # file_big_npy = (
        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        # )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            crepe_hop_length,
            f0_file=f0_file,
        )
        print(
            "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep=""
        )
        return "Success", (tgt_sr, audio_opt)
    except:
        info = traceback.format_exc()
        print(info)
        return info, (None, None)

def start_inference():
    # Get essential paths first
    model_name = str(sys.argv[1])  # MyModel.pth
    source_audio_path = str(sys.argv[2])  # Source audio path
    output_file_name = str(sys.argv[3])  # Output file name, written into ./audio-outputs, e.g. conversion_out.wav
    feature_index_path = str(sys.argv[4])  # Feature index file path
    f0_file = None  # Not implemented yet. To be implemented later on

    # Get parameters for inference
    speaker_id = int(sys.argv[5])  # 0
    transposition = float(sys.argv[6])  # 0.0 float
    f0_method = str(sys.argv[7])  # harvest
    crepe_hop_length = int(sys.argv[8])  # 128
    feature_ratio = float(sys.argv[9])  # 0.78

    # Load the model first; get_vc sets the global net_g, vc, tgt_sr, and cpt used by the pipeline
    print("Mangio-RVC-Fork Infer-CLI: Starting the inference...")
    vc_data = get_vc(model_name)
    print(vc_data)
    print("Mangio-RVC-Fork Infer-CLI: Performing inference...")
    conversion_data = vc_single(
        speaker_id,
        source_audio_path,
        transposition,
        f0_file,
        f0_method,
        feature_index_path,
        feature_ratio,
        crepe_hop_length,
    )
    if conversion_data[0] == "Success":
        print("Mangio-RVC-Fork Infer-CLI: Inference succeeded. Writing to %s/%s..." % ('audio-outputs', output_file_name))
        # vc_single returns (status, (sample_rate, audio)); write them out with scipy
        write('%s/%s' % ('audio-outputs', output_file_name), conversion_data[1][0], conversion_data[1][1])
        print("Mangio-RVC-Fork Infer-CLI: Finished! Saved output to %s/%s" % ('audio-outputs', output_file_name))
    else:
        print("Mangio-RVC-Fork Infer-CLI: Inference failed. Here's the traceback: ")
        print(conversion_data[0])


start_inference()
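
The positional sys.argv parsing above works, but a typo in any argument fails without a usage message. An argparse front end covering the same nine arguments could look like the sketch below; the parser and help strings are illustrative, not part of this commit:

```python
# Illustrative only: an argparse equivalent of the positional parsing
# in start_inference(). Names and help texts are descriptive stand-ins.
import argparse

def parse_cli_args():
    p = argparse.ArgumentParser(description="Mangio-RVC-Fork CLI inference")
    p.add_argument("model_name", help="model file in the weights folder, e.g. mi-test.pth")
    p.add_argument("source_audio_path", help="path to the source audio file (.wav)")
    p.add_argument("output_file_name", help="output name, written into ./audio-outputs")
    p.add_argument("feature_index_path", help="path to the added_*.index feature file")
    p.add_argument("speaker_id", type=int, help="speaker ID, usually 0")
    p.add_argument("transposition", type=float, help="semitones to transpose, e.g. 12 or -2")
    p.add_argument("f0_method", choices=["harvest", "pm", "crepe", "dio", "crepe-tiny"])
    p.add_argument("crepe_hop_length", type=int, help="hop length for the crepe methods, e.g. 128")
    p.add_argument("feature_ratio", type=float, help="feature index ratio, e.g. 0.78")
    return p.parse_args()
```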