Added CLI functionality for inference! No need to use the GUI for inference. Added an audio-outputs directory

This commit is contained in:
Mangio621
2023-05-11 03:32:09 +10:00
parent 0b3e29122e
commit a3ac337f6b
4 changed files with 210 additions and 9 deletions

README.md

@@ -52,6 +52,7 @@ Special thanks to discord user @kalomaze#2983 for creating a temporary colab not
 + Added CLI functionality
 + added train-index-cli.py to train the feature index without the GUI
 + added extract-small-model-cli.py to extract the small model without the GUI
++ added infer-cli.py to do inference without the GUI.
 ## This repository has the following features too:
 + Reduce tone leakage by replacing source feature to training-set feature using top1 retrieval;
@@ -193,6 +194,22 @@ python train-index-cli.py mi-test
 python extract-small-model-cli.py logs/G_99750.pth MyModel 40k 1 "This is a cool model."
 ```
+## Inference without the GUI (Voice Conversion)
+```bash
+# + Mangio-RVC-Fork Feature. Infer audio with just the CLI
+# Arguments
+# arg1 = model name in the weights folder (mi-test.pth)
+# arg2 = source file path (.wav)
+# arg3 = output file name, placed in ./audio-outputs (myoutput.wav)
+# arg4 = feature index file path (E:\added_IVF3042_Flat_nprobe_1.index)
+# arg5 = speaker ID (0)
+# arg6 = transposition (12 = 12 semitones up)
+# arg7 = f0 method (harvest, pm, crepe, dio, crepe-tiny)
+# arg8 = crepe hop length; use 128 (applies to the crepe f0 methods only)
+# arg9 = feature index ratio (0.78)
+python infer-cli.py mi-test.pth E:\my-source-file.wav conversion_output.wav E:\added_IVF3042_Flat_nprobe_1.index 0 -2 pm 128 0.78
+```
 # Running the Tensorboard 📉
 ```bash
 cd Mangio-RVC-Fork
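
For scripted use, the same nine positional arguments can be driven from Python through the standard subprocess module. A minimal sketch, assuming placeholder model, audio, and index paths and that it is run from the repository root:

```python
# Minimal sketch: drive infer-cli.py from a Python script.
# The model, audio, and index paths below are placeholders, not files shipped with the fork.
import subprocess

subprocess.run(
    [
        "python", "infer-cli.py",
        "mi-test.pth",                         # arg1: model in the weights folder
        "my-source-file.wav",                  # arg2: source audio path
        "conversion_output.wav",               # arg3: written to ./audio-outputs
        "added_IVF3042_Flat_nprobe_1.index",   # arg4: feature index path
        "0",                                   # arg5: speaker ID
        "-2",                                  # arg6: transposition in semitones
        "pm",                                  # arg7: f0 method
        "128",                                 # arg8: crepe hop length (crepe only)
        "0.78",                                # arg9: feature index ratio
    ],
    check=True,  # raise CalledProcessError if the CLI exits non-zero
)
```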

config.py

@@ -4,20 +4,21 @@ from multiprocessing import cpu_count
 class Config:
-    def __init__(self):
+    def __init__(self, is_gui=True):
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
         self.gpu_name = None
         self.gpu_mem = None
-        (
-            self.python_cmd,
-            self.listen_port,
-            self.iscolab,
-            self.noparallel,
-            self.noautoopen,
-            self.paperspace,
-        ) = self.arg_parse()
+        if is_gui:
+            (
+                self.python_cmd,
+                self.listen_port,
+                self.iscolab,
+                self.noparallel,
+                self.noautoopen,
+                self.paperspace,
+            ) = self.arg_parse()
         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
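
The new is_gui flag lets command-line scripts construct Config without running the Gradio-oriented argument parser, which would otherwise consume sys.argv. A minimal sketch of the intended use, assuming it runs from the repository root where config.py lives:

```python
# Minimal sketch: skip arg_parse() so sys.argv stays free for the
# CLI script's own positional arguments (this is what infer-cli.py does).
from config import Config

config = Config(is_gui=False)  # python_cmd, listen_port, etc. are simply not set
print(config.device, config.is_half)  # device_config() still runs unconditionally
```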

extract-small-model-cli.py

@@ -1,3 +1,5 @@
+# Fork Feature Mangio RVC Fork. Extract the small model from a checkpoint with CLI.
 import sys
 from train.process_ckpt import extract_small_model

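Only the imports of the new CLI script are shown in this hunk. Based on the README invocation above, here is a hedged sketch of the forwarding it presumably performs; the parameter order of extract_small_model is inferred from that example, not confirmed by this diff:

```python
# Hedged sketch, not the committed file: forward the five CLI arguments
# from the README example to train.process_ckpt.extract_small_model.
import sys
from train.process_ckpt import extract_small_model

ckpt_path = str(sys.argv[1])    # e.g. logs/G_99750.pth
save_name = str(sys.argv[2])    # e.g. MyModel
sample_rate = str(sys.argv[3])  # e.g. 40k
if_f0 = str(sys.argv[4])        # 1 = trained with pitch guidance (assumed)
info = str(sys.argv[5])         # free-form model notes
print(extract_small_model(ckpt_path, save_name, sample_rate, if_f0, info))
```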
infer-cli.py (new file, 181 lines)

@@ -0,0 +1,181 @@
# Fork Feature Mangio RVC Fork. Infer audio with just the CLI
import torch, os, traceback, sys, warnings, shutil, numpy as np
from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
from fairseq import checkpoint_utils
from vc_infer_pipeline import VC
from config import Config
from my_utils import load_audio

# Fork Feature. Write an audio file
from scipy.io.wavfile import write

config = Config(is_gui=False)
weight_root = 'weights'

n_spk = None  # Set from get_vc
tgt_sr = 0  # Set from get_vc
net_g = None  # Set from get_vc
vc = None  # Set from get_vc
cpt = None  # Set from get_vc
hubert_model = None  # Set from vc_single

def get_hubert():
    # Load the base HuBERT feature extractor onto the configured device
    models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = models[0]
    hubert_model = hubert_model.to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    hubert_model.eval()
    return hubert_model

def get_vc(sid):
    global n_spk, tgt_sr, net_g, vc, cpt
    if sid == []:  # Inherited from the GUI flow: unload the current model when no sid is selected
        global hubert_model
        if hubert_model is not None:  # polling may have switched sid from a loaded model to none, so check for that
            print("clean_empty_cache")
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            hubert_model = net_g = n_spk = vc = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            ### Without the juggling below, the cache is never fully cleared
            if_f0 = cpt.get("f0", 1)
            if if_f0 == 1:
                net_g = SynthesizerTrnMs256NSFsid(
                    *cpt["config"], is_half=config.is_half
                )
            else:
                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cpt = None
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))  # Without this line the state never clears fully; really odd
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    print("Mangio-RVC-Fork Infer-CLI: Model has been loaded...")
    return {"visible": True, "maximum": n_spk, "__type__": "update"}

def vc_single(
    sid,
    input_audio,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    # file_big_npy,
    index_rate,
    crepe_hop_length,
):  # spk_item, input_audio0, vc_transform0, f0_file, f0method0
    global tgt_sr, net_g, vc, hubert_model, cpt
    if input_audio is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        audio = load_audio(input_audio, 16000)
        times = [0, 0, 0]
        if hubert_model is None:
            hubert_model = get_hubert()
        if_f0 = cpt.get("f0", 1)
        file_index = (
            file_index.strip(" ")
            .strip('"')
            .strip("\n")
            .strip('"')
            .strip(" ")
            .replace("trained", "added")
        )  # Guard against a common user typo by automatically replacing "trained" with "added"
        # file_big_npy = (
        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        # )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            crepe_hop_length,
            f0_file=f0_file,
        )
        print(
            "npy: ", times[0], "s, f0: ", times[1], "s, infer: ", times[2], "s", sep=""
        )
        return "Success", (tgt_sr, audio_opt)
    except:
        info = traceback.format_exc()
        print(info)
        return info, (None, None)

def start_inference():
    # Get essential paths first
    model_name = str(sys.argv[1])  # MyModel.pth
    source_audio_path = str(sys.argv[2])  # Source audio path
    output_file_name = str(sys.argv[3])  # Output file name, written into ./audio-outputs, e.g. conversion_out.wav
    feature_index_path = str(sys.argv[4])  # Feature index file path
    f0_file = None  # Not implemented yet. To be implemented later on

    # Get parameters for inference
    speaker_id = int(sys.argv[5])  # 0
    transposition = float(sys.argv[6])  # 0.0 float
    f0_method = str(sys.argv[7])  # harvest
    crepe_hop_length = int(sys.argv[8])  # 128
    feature_ratio = float(sys.argv[9])  # 0.78

    # Load the model first; get_vc sets the global net_g, vc, tgt_sr, and cpt used by the pipeline
    print("Mangio-RVC-Fork Infer-CLI: Starting the inference...")
    vc_data = get_vc(model_name)
    print(vc_data)
    print("Mangio-RVC-Fork Infer-CLI: Performing inference...")
    conversion_data = vc_single(
        speaker_id,
        source_audio_path,
        transposition,
        f0_file,
        f0_method,
        feature_index_path,
        feature_ratio,
        crepe_hop_length,
    )
    if conversion_data[0] == "Success":
        print("Mangio-RVC-Fork Infer-CLI: Inference succeeded. Writing to %s/%s..." % ('audio-outputs', output_file_name))
        # vc_single returns (status, (sample_rate, audio)); write them out with scipy
        write('%s/%s' % ('audio-outputs', output_file_name), conversion_data[1][0], conversion_data[1][1])
        print("Mangio-RVC-Fork Infer-CLI: Finished! Saved output to %s/%s" % ('audio-outputs', output_file_name))
    else:
        print("Mangio-RVC-Fork Infer-CLI: Inference failed. Here's the traceback: ")
        print(conversion_data[0])


start_inference()
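
The positional sys.argv parsing above works, but a typo in any argument fails without a usage message. An argparse front end covering the same nine arguments could look like the sketch below; the parser and help strings are illustrative, not part of this commit:

```python
# Illustrative only: an argparse equivalent of the positional parsing
# in start_inference(). Names and help texts are descriptive stand-ins.
import argparse

def parse_cli_args():
    p = argparse.ArgumentParser(description="Mangio-RVC-Fork CLI inference")
    p.add_argument("model_name", help="model file in the weights folder, e.g. mi-test.pth")
    p.add_argument("source_audio_path", help="path to the source audio file (.wav)")
    p.add_argument("output_file_name", help="output name, written into ./audio-outputs")
    p.add_argument("feature_index_path", help="path to the added_*.index feature file")
    p.add_argument("speaker_id", type=int, help="speaker ID, usually 0")
    p.add_argument("transposition", type=float, help="semitones to transpose, e.g. 12 or -2")
    p.add_argument("f0_method", choices=["harvest", "pm", "crepe", "dio", "crepe-tiny"])
    p.add_argument("crepe_hop_length", type=int, help="hop length for the crepe methods, e.g. 128")
    p.add_argument("feature_ratio", type=float, help="feature index ratio, e.g. 0.78")
    return p.parse_args()
```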