Merge pull request #49 from Mangio621/TRUE-FIXED-FP16

Fix training times by fixing fp16 detection
Author: kalomaze
Date: 2023-07-22 22:48:22 -05:00
Committed by: GitHub
8 changed files with 673 additions and 424 deletions

View File

@@ -1,19 +1,73 @@
 import argparse
 import sys
 import torch
+import json
 from multiprocessing import cpu_count
 
+global usefp16
+usefp16 = False
+
 
 def use_fp32_config():
-    for config_file in ["32k.json", "40k.json", "48k.json"]:
-        with open(f"configs/{config_file}", "r") as f:
-            strr = f.read().replace("true", "false")
-        with open(f"configs/{config_file}", "w") as f:
-            f.write(strr)
-    with open("trainset_preprocess_pipeline_print.py", "r") as f:
-        strr = f.read().replace("3.7", "3.0")
-    with open("trainset_preprocess_pipeline_print.py", "w") as f:
-        f.write(strr)
+    usefp16 = False
+    device_capability = 0
+    if torch.cuda.is_available():
+        device = torch.device("cuda:0")  # Assuming you have only one GPU (index 0).
+        device_capability = torch.cuda.get_device_capability(device)[0]
+        if device_capability >= 7:
+            usefp16 = True
+            for config_file in ["32k.json", "40k.json", "48k.json"]:
+                with open(f"configs/{config_file}", "r") as d:
+                    data = json.load(d)
+
+                if "train" in data and "fp16_run" in data["train"]:
+                    data["train"]["fp16_run"] = True
+
+                with open(f"configs/{config_file}", "w") as d:
+                    json.dump(data, d, indent=4)
+
+                print(f"Set fp16_run to true in {config_file}")
+
+            with open(
+                "trainset_preprocess_pipeline_print.py", "r", encoding="utf-8"
+            ) as f:
+                strr = f.read()
+
+            strr = strr.replace("3.0", "3.7")
+
+            with open(
+                "trainset_preprocess_pipeline_print.py", "w", encoding="utf-8"
+            ) as f:
+                f.write(strr)
+        else:
+            for config_file in ["32k.json", "40k.json", "48k.json"]:
+                with open(f"configs/{config_file}", "r") as f:
+                    data = json.load(f)
+
+                if "train" in data and "fp16_run" in data["train"]:
+                    data["train"]["fp16_run"] = False
+
+                with open(f"configs/{config_file}", "w") as d:
+                    json.dump(data, d, indent=4)
+
+                print(f"Set fp16_run to false in {config_file}")
+
+            with open(
+                "trainset_preprocess_pipeline_print.py", "r", encoding="utf-8"
+            ) as f:
+                strr = f.read()
+
+            strr = strr.replace("3.7", "3.0")
+
+            with open(
+                "trainset_preprocess_pipeline_print.py", "w", encoding="utf-8"
+            ) as f:
+                f.write(strr)
+    else:
+        print(
+            "CUDA is not available. Make sure you have an NVIDIA GPU and CUDA installed."
+        )
+    return (usefp16, device_capability)
 
 
 class Config:
@@ -32,7 +86,7 @@ class Config:
             self.paperspace,
             self.is_cli,
         ) = self.arg_parse()
         self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
 
     @staticmethod
@@ -50,11 +104,15 @@ class Config:
             action="store_true",
             help="Do not open in browser automatically",
         )
         parser.add_argument(  # Fork Feature. Paperspace integration for web UI
-            "--paperspace", action="store_true", help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems."
+            "--paperspace",
+            action="store_true",
+            help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems.",
         )
         parser.add_argument(  # Fork Feature. Embed a CLI into the infer-web.py
-            "--is_cli", action="store_true", help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!"
+            "--is_cli",
+            action="store_true",
+            help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!",
         )
         cmd_opts = parser.parse_args()
@@ -95,9 +153,9 @@ class Config:
             ):
                 print("Found GPU", self.gpu_name, ", force to fp32")
                 self.is_half = False
-                use_fp32_config()
             else:
                 print("Found GPU", self.gpu_name)
+                use_fp32_config()
             self.gpu_mem = int(
                 torch.cuda.get_device_properties(i_device).total_memory
                 / 1024
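
Note: the capability check above is the substance of the fix. fp16_run is only enabled when the GPU reports CUDA compute capability 7.0 or higher (Volta/Turing and newer, the cards with tensor cores), which is where half precision actually trains faster. A minimal standalone sketch of the same test; the helper name should_use_fp16 is illustrative, not part of this repo:

import torch

def should_use_fp16() -> bool:
    # Mirror of the detection in use_fp32_config(): fp16 only pays off on
    # GPUs with compute capability major version >= 7 (tensor cores).
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability(torch.device("cuda:0"))
    return major >= 7

print("fp16 recommended:", should_use_fp16())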

View File

@@ -5,10 +5,10 @@ sys.path.append(now_dir)
 from my_utils import load_audio
 import pyworld
 import numpy as np, logging
-import torchcrepe # Fork Feature. Crepe algo for training and preprocess
+import torchcrepe  # Fork Feature. Crepe algo for training and preprocess
 import torch
-from torch import Tensor # Fork Feature. Used for pitch prediction for torch crepe.
-import scipy.signal as signal # Fork Feature hybrid inference
+from torch import Tensor  # Fork Feature. Used for pitch prediction for torch crepe.
+import scipy.signal as signal  # Fork Feature hybrid inference
 import tqdm
 
 logging.getLogger("numba").setLevel(logging.WARNING)
@@ -19,9 +19,9 @@ f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
 DoFormant = False
-with open('formanting.txt', 'r') as fvf:
+with open("formanting.txt", "r") as fvf:
     content = fvf.readlines()
-Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
+Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
 
 
 def printt(strr):
@@ -32,7 +32,7 @@ def printt(strr):
 n_p = int(sys.argv[2])
 f0method = sys.argv[3]
 extraction_crepe_hop_length = 0
 try:
     extraction_crepe_hop_length = int(sys.argv[4])
 except:
@@ -53,11 +53,11 @@ class FeatureInput(object):
         self.f0_min = 50.0
         self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
         self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
 
     # EXPERIMENTAL. PROBABLY BUGGY
     def get_f0_hybrid_computation(
         self,
         methods_str,
         x,
         f0_min,
         f0_max,
@@ -67,9 +67,9 @@ class FeatureInput(object):
     ):
         # Get various f0 methods from input to use in the computation stack
         s = methods_str
-        s = s.split('hybrid')[1]
-        s = s.replace('[', '').replace(']', '')
-        methods = s.split('+')
+        s = s.split("hybrid")[1]
+        s = s.replace("[", "").replace("]", "")
+        methods = s.split("+")
         f0_computation_stack = []
 
         print("Calculating f0 pitch estimations for methods: %s" % str(methods))
@@ -99,7 +99,9 @@ class FeatureInput(object):
                 torch_device_index = 0
                 torch_device = None
                 if torch.cuda.is_available():
-                    torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
+                    torch_device = torch.device(
+                        f"cuda:{torch_device_index % torch.cuda.device_count()}"
+                    )
                 elif torch.backends.mps.is_available():
                     torch_device = torch.device("mps")
                 else:
@@ -123,7 +125,7 @@ class FeatureInput(object):
                 f0 = torchcrepe.filter.mean(f0, 3)
                 f0[pd < 0.1] = 0
                 f0 = f0[0].cpu().numpy()
-                f0 = f0[1:] # Get rid of extra first frame
+                f0 = f0[1:]  # Get rid of extra first frame
             elif method == "mangio-crepe":
                 # print("Performing crepe pitch extraction. (EXPERIMENTAL)")
                 # print("CREPE PITCH EXTRACTION HOP LENGTH: " + str(crepe_hop_length))
@@ -132,7 +134,9 @@ class FeatureInput(object):
                 torch_device_index = 0
                 torch_device = None
                 if torch.cuda.is_available():
-                    torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
+                    torch_device = torch.device(
+                        f"cuda:{torch_device_index % torch.cuda.device_count()}"
+                    )
                 elif torch.backends.mps.is_available():
                     torch_device = torch.device("mps")
                 else:
@@ -156,7 +160,7 @@ class FeatureInput(object):
                     "full",
                     batch_size=crepe_hop_length * 2,
                     device=torch_device,
-                    pad=True
+                    pad=True,
                 )
                 p_len = p_len or x.shape[0] // crepe_hop_length
                 # Resize the pitch
@@ -165,7 +169,7 @@ class FeatureInput(object):
                 target = np.interp(
                     np.arange(0, len(source) * p_len, len(source)) / p_len,
                     np.arange(0, len(source)),
-                    source
+                    source,
                 )
                 f0 = np.nan_to_num(target)
             elif method == "harvest":
elif method == "harvest": elif method == "harvest":
@@ -191,12 +195,12 @@ class FeatureInput(object):
f0 = signal.medfilt(f0, 3) f0 = signal.medfilt(f0, 3)
f0 = f0[1:] f0 = f0[1:]
f0_computation_stack.append(f0) f0_computation_stack.append(f0)
for fc in f0_computation_stack: for fc in f0_computation_stack:
print(len(fc)) print(len(fc))
# print("Calculating hybrid median f0 from the stack of: %s" % str(methods)) # print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
f0_median_hybrid = None f0_median_hybrid = None
if len(f0_computation_stack) == 1: if len(f0_computation_stack) == 1:
f0_median_hybrid = f0_computation_stack[0] f0_median_hybrid = f0_computation_stack[0]
@@ -236,10 +240,9 @@ class FeatureInput(object):
         elif f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
                 from rmvpe import RMVPE
                 print("loading rmvpe model")
-                self.model_rmvpe = RMVPE(
-                    "rmvpe.pt", is_half=False, device="cuda:0"
-                )
+                self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cuda:0")
             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         elif f0_method == "dio":
             f0, t = pyworld.dio(
@@ -250,12 +253,16 @@ class FeatureInput(object):
                 frame_period=1000 * self.hop / self.fs,
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
-        elif f0_method == "crepe": # Fork Feature: Added crepe f0 for f0 feature extraction
+        elif (
+            f0_method == "crepe"
+        ):  # Fork Feature: Added crepe f0 for f0 feature extraction
             # Pick a batch size that doesn't cause memory errors on your gpu
             torch_device_index = 0
             torch_device = None
             if torch.cuda.is_available():
-                torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
+                torch_device = torch.device(
+                    f"cuda:{torch_device_index % torch.cuda.device_count()}"
+                )
             elif torch.backends.mps.is_available():
                 torch_device = torch.device("mps")
             else:
@@ -287,7 +294,9 @@ class FeatureInput(object):
             torch_device_index = 0
             torch_device = None
             if torch.cuda.is_available():
-                torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
+                torch_device = torch.device(
+                    f"cuda:{torch_device_index % torch.cuda.device_count()}"
+                )
             elif torch.backends.mps.is_available():
                 torch_device = torch.device("mps")
             else:
@@ -311,7 +320,7 @@ class FeatureInput(object):
                 "full",
                 batch_size=crepe_hop_length * 2,
                 device=torch_device,
-                pad=True
+                pad=True,
             )
             p_len = p_len or x.shape[0] // crepe_hop_length
             # Resize the pitch
@@ -320,20 +329,20 @@ class FeatureInput(object):
             target = np.interp(
                 np.arange(0, len(source) * p_len, len(source)) / p_len,
                 np.arange(0, len(source)),
-                source
+                source,
             )
             f0 = np.nan_to_num(target)
-        elif "hybrid" in f0_method: # EXPERIMENTAL
+        elif "hybrid" in f0_method:  # EXPERIMENTAL
             # Perform hybrid median pitch estimation
             time_step = 160 / 16000 * 1000
             f0 = self.get_f0_hybrid_computation(
                 f0_method,
                 x,
                 self.f0_min,
                 self.f0_max,
                 p_len,
                 crepe_hop_length,
-                time_step
+                time_step,
             )
         # Mangio-RVC-Fork Feature: Add hybrid f0 inference to feature extraction. EXPERIMENTAL...
@@ -362,14 +371,19 @@ class FeatureInput(object):
         with tqdm.tqdm(total=len(paths), leave=True, position=thread_n) as pbar:
             for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
                 try:
-                    pbar.set_description("thread:%s, f0ing, Hop-Length:%s" % (thread_n, crepe_hop_length))
+                    pbar.set_description(
+                        "thread:%s, f0ing, Hop-Length:%s"
+                        % (thread_n, crepe_hop_length)
+                    )
                     pbar.update(1)
                     if (
                         os.path.exists(opt_path1 + ".npy") == True
                         and os.path.exists(opt_path2 + ".npy") == True
                     ):
                         continue
-                    featur_pit = self.compute_f0(inp_path, f0_method, crepe_hop_length)
+                    featur_pit = self.compute_f0(
+                        inp_path, f0_method, crepe_hop_length
+                    )
                     np.save(
                         opt_path2,
                         featur_pit,
@@ -382,7 +396,9 @@ class FeatureInput(object):
                         allow_pickle=False,
                     )  # ori
                 except:
-                    printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
+                    printt(
+                        "f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())
+                    )
 
 
 if __name__ == "__main__":
@@ -411,12 +427,7 @@ if __name__ == "__main__":
         for i in range(n_p):
             p = Process(
                 target=featureInput.go,
-                args=(
-                    paths[i::n_p],
-                    f0method,
-                    extraction_crepe_hop_length,
-                    i
-                ),
+                args=(paths[i::n_p], f0method, extraction_crepe_hop_length, i),
             )
             ps.append(p)
             p.start()
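
Note: the hybrid path above parses a method string such as hybrid[harvest+crepe], runs every named estimator, stacks the resulting f0 arrays, and reduces them element-wise with a median. A hedged sketch of that reduction step with made-up values; the real stack is built from the estimators in this file and may differ in detail:

import numpy as np

# Stand-in for f0_computation_stack once each estimator has run; the real
# arrays come from harvest/crepe/dio and are trimmed to equal length.
f0_computation_stack = [
    np.array([110.0, 112.0, np.nan, 220.0]),  # e.g. harvest
    np.array([111.0, 113.0, 114.0, 219.0]),   # e.g. crepe
]
if len(f0_computation_stack) == 1:
    f0_median_hybrid = f0_computation_stack[0]
else:
    # nanmedian ignores frames where one estimator failed (NaN).
    f0_median_hybrid = np.nanmedian(np.vstack(f0_computation_stack), axis=0)
print(f0_median_hybrid)  # [110.5 112.5 114.  219.5]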

View File

@@ -51,8 +51,10 @@ class RVC:
         self.window = 160
 
         # Get Torch Device
-        if(torch.cuda.is_available()):
-            self.torch_device = torch.device(f"cuda:{0 % torch.cuda.device_count()}")
+        if torch.cuda.is_available():
+            self.torch_device = torch.device(
+                f"cuda:{0 % torch.cuda.device_count()}"
+            )
         elif torch.backends.mps.is_available():
             self.torch_device = torch.device("mps")
         else:
@@ -141,7 +143,7 @@ class RVC:
     def get_f0(self, x, f0_up_key, inp_f0=None):
         # Calculate Padding and f0 details here
-        p_len = x.shape[0] // 512 # For Now This probs doesn't work
+        p_len = x.shape[0] // 512  # For Now This probs doesn't work
         x_pad = 1
         f0_min = 50
         f0_max = 1100
@@ -150,11 +152,11 @@ class RVC:
         f0 = 0
         # Here, check f0_methods and get their computations
-        if(self.f0_method == 'harvest'):
+        if self.f0_method == "harvest":
             f0 = self.get_harvest_computation(x, f0_min, f0_max)
-        elif(self.f0_method == 'reg-crepe'):
+        elif self.f0_method == "reg-crepe":
             f0 = self.get_regular_crepe_computation(x, f0_min, f0_max)
-        elif(self.f0_method == 'reg-crepe-tiny'):
+        elif self.f0_method == "reg-crepe-tiny":
             f0 = self.get_regular_crepe_computation(x, f0_min, f0_max, "tiny")
 
         # Calculate f0_course and f0_bak here
@@ -300,7 +302,7 @@ class GUI:
             with open("values1.json", "r") as j:
                 data = json.load(j)
         except:
             # Injecting f0_method into the json data
             with open("values1.json", "w") as j:
                 data = {
                     "pth_path": "",
@@ -328,11 +330,7 @@ class GUI:
                 [
                     sg.Frame(
                         title="Proudly forked by Mangio621",
-                        layout=[
-                            [
-                                sg.Image('./mangio_utils/lol.png')
-                            ]
-                        ]
+                        layout=[[sg.Image("./mangio_utils/lol.png")]],
                     ),
                     sg.Frame(
                         title=i18n("加载模型"),
@@ -384,14 +382,16 @@ class GUI:
                         ),
                     ],
                 ],
-            )
+            ),
         ],
         [
             # Mangio f0 Selection frame Here
             sg.Frame(
                 layout=[
                     [
-                        sg.Radio("Harvest", "f0_method", key="harvest", default=True),
+                        sg.Radio(
+                            "Harvest", "f0_method", key="harvest", default=True
+                        ),
                         sg.Radio("Crepe", "f0_method", key="reg-crepe"),
                         sg.Radio("Crepe Tiny", "f0_method", key="reg-crepe-tiny"),
                     ]
@@ -536,20 +536,21 @@ class GUI:
         if event == "stop_vc" and self.flag_vc == True:
             self.flag_vc = False
 
     # Function that returns the used f0 method in string format "harvest"
     def get_f0_method_from_radios(self, values):
         f0_array = [
-            {"name": "harvest", "val": values['harvest']},
-            {"name": "reg-crepe", "val": values['reg-crepe']},
-            {"name": "reg-crepe-tiny", "val": values['reg-crepe-tiny']},
+            {"name": "harvest", "val": values["harvest"]},
+            {"name": "reg-crepe", "val": values["reg-crepe"]},
+            {"name": "reg-crepe-tiny", "val": values["reg-crepe-tiny"]},
         ]
         # Filter through to find a true value
         used_f0 = ""
         for f0 in f0_array:
-            if(f0['val'] == True):
-                used_f0 = f0['name']
+            if f0["val"] == True:
+                used_f0 = f0["name"]
                 break
-        if(used_f0 == ""): used_f0 = "harvest" # Default Harvest if used_f0 is empty somehow
+        if used_f0 == "":
+            used_f0 = "harvest"  # Default Harvest if used_f0 is empty somehow
         return used_f0
 
     def set_values(self, values):
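
Note: the CUDA, then MPS, then CPU device fallback seen in this file recurs across the changed files; vc_infer_pipeline.py factors it into get_optimal_torch_device. A standalone version of the same pattern, with an illustrative function name:

import torch

def pick_device(index: int = 0) -> torch.device:
    # Prefer CUDA (fast), then Apple-silicon MPS, then plain CPU.
    if torch.cuda.is_available():
        return torch.device(f"cuda:{index % torch.cuda.device_count()}")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")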

File diff suppressed because it is too large

View File

@@ -1,11 +1,14 @@
 import ffmpeg
 import numpy as np
-#import praatio
-#import praatio.praat_scripts
+
+# import praatio
+# import praatio.praat_scripts
 import os
-#from os.path import join
-#praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
+
+# from os.path import join
+# praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
 
 def load_audio(file, sr, DoFormant, Quefrency, Timbre):
     try:
@@ -15,43 +18,47 @@ def load_audio(file, sr, DoFormant, Quefrency, Timbre):
         file = (
             file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
         )  # 防止小白拷路径头尾带了空格和"和回车
-        file_formanted = (
-            file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
-        )
-
-        with open('formanting.txt', 'r') as fvf:
+        file_formanted = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+
+        with open("formanting.txt", "r") as fvf:
             content = fvf.readlines()
-        if 'True' in content[0].split('\n')[0]:
-            #print("true")
+        if "True" in content[0].split("\n")[0]:
+            # print("true")
             DoFormant = True
-            Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
+            Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
         else:
-            #print("not true")
+            # print("not true")
             DoFormant = False
 
         if DoFormant:
-            #os.system(f"stftpitchshift -i {file} -q {Quefrency} -t {Timbre} -o {file_formanted}")
-            #print('stftpitchshift -i "%s" -p 1.0 --rms -w 128 -v 8 -q %s -t %s -o "%s"' % (file, Quefrency, Timbre, file_formanted))
+            # os.system(f"stftpitchshift -i {file} -q {Quefrency} -t {Timbre} -o {file_formanted}")
+            # print('stftpitchshift -i "%s" -p 1.0 --rms -w 128 -v 8 -q %s -t %s -o "%s"' % (file, Quefrency, Timbre, file_formanted))
             print("formanting...")
-            os.system('stftpitchshift -i "%s" -q %s -t %s -o "%sFORMANTED"' % (file, Quefrency, Timbre, file_formanted))
+            os.system(
+                'stftpitchshift -i "%s" -q %s -t %s -o "%sFORMANTED"'
+                % (file, Quefrency, Timbre, file_formanted)
+            )
             print("formanted!")
-            #filepraat = (os.path.abspath(os.getcwd()) + '\\' + file).replace('/','\\')
-            #file_formantedpraat = ('"' + os.path.abspath(os.getcwd()) + '/' + 'formanted'.join(file_formanted) + '"').replace('/','\\')
+            # filepraat = (os.path.abspath(os.getcwd()) + '\\' + file).replace('/','\\')
+            # file_formantedpraat = ('"' + os.path.abspath(os.getcwd()) + '/' + 'formanted'.join(file_formanted) + '"').replace('/','\\')
             out, _ = (
-                ffmpeg.input('%sFORMANTED%s' % (file_formanted, '.wav'), threads=0)
+                ffmpeg.input("%sFORMANTED%s" % (file_formanted, ".wav"), threads=0)
                 .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
-                .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+                .run(
+                    cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
+                )
             )
-            os.remove('%sFORMANTED%s' % (file_formanted, '.wav'))
+            os.remove("%sFORMANTED%s" % (file_formanted, ".wav"))
         else:
             out, _ = (
                 ffmpeg.input(file, threads=0)
                 .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
-                .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+                .run(
+                    cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
+                )
             )
     except Exception as e:
         raise RuntimeError(f"Failed to load audio: {e}")
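
Note: the decode path above pipes ffmpeg's raw f32le output into memory, and the caller then reinterprets those bytes as float32 samples. A minimal sketch of the round trip, assuming the ffmpeg-python package and a hypothetical input file some.wav:

import ffmpeg
import numpy as np

def decode_to_float32(path: str, sr: int = 16000) -> np.ndarray:
    # Decode any ffmpeg-readable file to mono float32 PCM at sr Hz.
    out, _ = (
        ffmpeg.input(path, threads=0)
        .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
        .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
    )
    # Reinterpret the raw byte stream as a 1-D float32 array.
    return np.frombuffer(out, np.float32).flatten()

# audio = decode_to_float32("some.wav")  # hypothetical input file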

View File

@@ -568,10 +568,10 @@ def train_and_evaluate(
                     ),
                 )
             )
         with open("stop.txt", "r+") as tostop:
             content = tostop.read()
-            if 'stop' in content:
+            if "stop" in content:
                 logger.info("Stop Button was pressed. The program is closed.")
                 if hasattr(net_g, "module"):
                     ckpt = net_g.module.state_dict()
@@ -581,15 +581,21 @@
                     "saving final ckpt:%s"
                     % (
                         savee(
-                            ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
+                            ckpt,
+                            hps.sample_rate,
+                            hps.if_f0,
+                            hps.name,
+                            epoch,
+                            hps.version,
+                            hps,
                         )
                     )
                 )
                 tostop.truncate(0)
                 tostop.writelines("not")
                 os._exit(2333333)
 
     if rank == 0:
         logger.info("====> Epoch: {} {}".format(epoch, epoch_recorder.record()))
     if epoch >= hps.total_epoch and rank == 0:
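
Note: stopping is handled through a file-based flag. The GUI's Stop button writes stop into stop.txt; the training loop polls the file, saves a final checkpoint, resets the flag to not, and hard-exits. A stripped-down sketch of the polling side (checkpoint saving elided; the seek(0) is an addition for clarity and is not in the original):

import os

def check_stop_flag():
    # Poll the sentinel file written by the GUI's Stop button.
    with open("stop.txt", "r+") as tostop:
        if "stop" in tostop.read():
            # ... save the final checkpoint here (savee(...) in the real code) ...
            tostop.truncate(0)   # clear the flag
            tostop.seek(0)       # rewind before rewriting (added for clarity)
            tostop.writelines("not")
            os._exit(2333333)    # hard-exit all workers, as the trainer does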

View File

@@ -24,9 +24,10 @@ Timbre = 0.0
 mutex = multiprocessing.Lock()
 f = open("%s/preprocess.log" % exp_dir, "a+")
 
-with open('formanting.txt', 'r') as fvf:
+with open("formanting.txt", "r") as fvf:
     content = fvf.readlines()
-Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
+Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
 
 
 def println(strr):
     mutex.acquire()
@@ -104,12 +105,14 @@ class PreProcess:
                     idx1 += 1
                     break
             self.norm_write(tmp_audio, idx0, idx1)
-            #println("%s->Suc." % path)
+            # println("%s->Suc." % path)
         except:
             println("%s->%s" % (path, traceback.format_exc()))
 
     def pipeline_mp(self, infos, thread_n):
-        for path, idx0 in tqdm.tqdm(infos, position=thread_n, leave=True, desc="thread:%s" % thread_n):
+        for path, idx0 in tqdm.tqdm(
+            infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
+        ):
             self.pipeline(path, idx0)
 
     def pipeline_mp_inp_dir(self, inp_root, n_p):
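
Note: position=thread_n pins each worker's tqdm bar to its own terminal row so parallel bars do not overwrite one another. A tiny illustration of the pattern; the worker body is a placeholder:

import tqdm
from multiprocessing import Process

def worker(items, thread_n):
    # One bar per process, kept on its own row via position=thread_n.
    for _ in tqdm.tqdm(items, position=thread_n, leave=True, desc="thread:%s" % thread_n):
        pass  # the real code calls self.pipeline(path, idx0) here

if __name__ == "__main__":
    ps = [Process(target=worker, args=(list(range(100)), n)) for n in range(2)]
    for p in ps:
        p.start()
    for p in ps:
        p.join()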

View File

@@ -1,7 +1,7 @@
 import numpy as np, parselmouth, torch, pdb, sys, os
 from time import time as ttime
 import torch.nn.functional as F
-import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
+import torchcrepe  # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
 from torch import Tensor
 import scipy.signal as signal
 import pyworld, os, traceback, faiss, librosa, torchcrepe
@@ -15,6 +15,7 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
 input_audio_path2wav = {}
 
+
 @lru_cache
 def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
     audio = input_audio_path2wav[input_audio_path]
@@ -74,24 +75,28 @@ class VC(object):
     def get_optimal_torch_device(self, index: int = 0) -> torch.device:
         # Get cuda device
         if torch.cuda.is_available():
-            return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
+            return torch.device(
+                f"cuda:{index % torch.cuda.device_count()}"
+            )  # Very fast
         elif torch.backends.mps.is_available():
             return torch.device("mps")
         # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
         # Else wise return the "cpu" as a torch device,
         return torch.device("cpu")
 
     # Fork Feature: Compute f0 with the crepe method
     def get_f0_crepe_computation(
         self,
         x,
         f0_min,
         f0_max,
         p_len,
-        hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
-        model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
+        hop_length=160,  # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
+        model="full",  # Either use crepe-tiny "tiny" or crepe "full". Default is full
     ):
-        x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
+        x = x.astype(
+            np.float32
+        )  # fixes the F.conv2D exception. We needed to convert double to float.
         x /= np.quantile(np.abs(x), 0.999)
         torch_device = self.get_optimal_torch_device()
         audio = torch.from_numpy(x).to(torch_device, copy=True)
@@ -109,7 +114,7 @@ class VC(object):
             model,
             batch_size=hop_length * 2,
             device=torch_device,
-            pad=True
+            pad=True,
         )
         p_len = p_len or x.shape[0] // hop_length
         # Resize the pitch for final f0
@@ -118,17 +123,17 @@ class VC(object):
         target = np.interp(
             np.arange(0, len(source) * p_len, len(source)) / p_len,
             np.arange(0, len(source)),
-            source
+            source,
         )
         f0 = np.nan_to_num(target)
-        return f0 # Resized f0
+        return f0  # Resized f0
 
     def get_f0_official_crepe_computation(
         self,
         x,
         f0_min,
         f0_max,
         model="full",
     ):
         # Pick a batch size that doesn't cause memory errors on your gpu
         batch_size = 512
@@ -153,15 +158,15 @@ class VC(object):
     # Fork Feature: Compute pYIN f0 method
     def get_f0_pyin_computation(self, x, f0_min, f0_max):
-        y, sr = librosa.load('saudio/Sidney.wav', self.sr, mono=True)
+        y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
         f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
-        f0 = f0[1:] # Get rid of extra first frame
+        f0 = f0[1:]  # Get rid of extra first frame
         return f0
 
     # Fork Feature: Acquire median hybrid f0 estimation calculation
     def get_f0_hybrid_computation(
         self,
         methods_str,
         input_audio_path,
         x,
         f0_min,
@@ -173,9 +178,9 @@ class VC(object):
     ):
         # Get various f0 methods from input to use in the computation stack
         s = methods_str
-        s = s.split('hybrid')[1]
-        s = s.replace('[', '').replace(']', '')
-        methods = s.split('+')
+        s = s.split("hybrid")[1]
+        s = s.replace("[", "").replace("]", "")
+        methods = s.split("+")
         f0_computation_stack = []
 
         print("Calculating f0 pitch estimations for methods: %s" % str(methods))
@@ -202,35 +207,39 @@ class VC(object):
                 )
             elif method == "crepe":
                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max)
-                f0 = f0[1:] # Get rid of extra first frame
+                f0 = f0[1:]  # Get rid of extra first frame
             elif method == "crepe-tiny":
                 f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
-                f0 = f0[1:] # Get rid of extra first frame
+                f0 = f0[1:]  # Get rid of extra first frame
             elif method == "mangio-crepe":
-                f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, crepe_hop_length
+                )
             elif method == "mangio-crepe-tiny":
-                f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+                )
             elif method == "harvest":
                 f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
                 if filter_radius > 2:
                     f0 = signal.medfilt(f0, 3)
-                f0 = f0[1:] # Get rid of first frame.
-            elif method == "dio": # Potentially buggy?
+                f0 = f0[1:]  # Get rid of first frame.
+            elif method == "dio":  # Potentially buggy?
                 f0, t = pyworld.dio(
                     x.astype(np.double),
                     fs=self.sr,
                     f0_ceil=f0_max,
                     f0_floor=f0_min,
-                    frame_period=10
+                    frame_period=10,
                 )
                 f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
                 f0 = signal.medfilt(f0, 3)
                 f0 = f0[1:]
-            #elif method == "pyin": Not Working just yet
-            #    f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
+            # elif method == "pyin": Not Working just yet
+            #    f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
             # Push method to the stack
             f0_computation_stack.append(f0)
 
         for fc in f0_computation_stack:
             print(len(fc))
@@ -280,13 +289,13 @@ class VC(object):
             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
             if filter_radius > 2:
                 f0 = signal.medfilt(f0, 3)
-        elif f0_method == "dio": # Potentially Buggy?
+        elif f0_method == "dio":  # Potentially Buggy?
             f0, t = pyworld.dio(
                 x.astype(np.double),
                 fs=self.sr,
                 f0_ceil=f0_max,
                 f0_floor=f0_min,
-                frame_period=10
+                frame_period=10,
             )
             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
             f0 = signal.medfilt(f0, 3)
@@ -295,12 +304,17 @@ class VC(object):
         elif f0_method == "crepe-tiny":
             f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
         elif f0_method == "mangio-crepe":
-            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
+            f0 = self.get_f0_crepe_computation(
+                x, f0_min, f0_max, p_len, crepe_hop_length
+            )
         elif f0_method == "mangio-crepe-tiny":
-            f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
+            f0 = self.get_f0_crepe_computation(
+                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
+            )
         elif f0_method == "rmvpe":
             if hasattr(self, "model_rmvpe") == False:
                 from rmvpe import RMVPE
                 print("loading rmvpe model")
                 self.model_rmvpe = RMVPE(
                     "rmvpe.pt", is_half=self.is_half, device=self.device
@@ -311,7 +325,7 @@ class VC(object):
             # Perform hybrid median pitch estimation
             input_audio_path2wav[input_audio_path] = x.astype(np.double)
             f0 = self.get_f0_hybrid_computation(
                 f0_method,
                 input_audio_path,
                 x,
                 f0_min,
@@ -319,7 +333,7 @@ class VC(object):
                 p_len,
                 filter_radius,
                 crepe_hop_length,
-                time_step
+                time_step,
             )
 
         f0 *= pow(2, f0_up_key / 12)
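
Note: the closing line transposes the whole f0 contour by f0_up_key equal-tempered semitones, since a shift of k semitones multiplies frequency by 2 ** (k / 12). A quick worked example with made-up values:

import numpy as np

f0 = np.array([110.0, 220.0, 440.0])  # made-up pitch track in Hz
f0_up_key = 12                        # +12 semitones = one octave up
print(f0 * 2 ** (f0_up_key / 12))     # -> [220. 440. 880.]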