Merge pull request #49 from Mangio621/TRUE-FIXED-FP16

Fix training times by fixing fp16 detection
Authored by kalomaze on 2023-07-22 22:48:22 -05:00, committed by GitHub.
8 changed files with 673 additions and 424 deletions


@@ -1,19 +1,73 @@
import argparse
import sys
import torch
import json
from multiprocessing import cpu_count
global usefp16
usefp16 = False
def use_fp32_config():
usefp16 = False
device_capability = 0
if torch.cuda.is_available():
device = torch.device("cuda:0") # Assuming you have only one GPU (index 0).
device_capability = torch.cuda.get_device_capability(device)[0]
if device_capability >= 7:
usefp16 = True
for config_file in ["32k.json", "40k.json", "48k.json"]:
with open(f"configs/{config_file}", "r") as d:
data = json.load(d)
if "train" in data and "fp16_run" in data["train"]:
data["train"]["fp16_run"] = True
with open(f"configs/{config_file}", "w") as d:
json.dump(data, d, indent=4)
print(f"Set fp16_run to true in {config_file}")
with open(
"trainset_preprocess_pipeline_print.py", "r", encoding="utf-8"
) as f:
strr = f.read()
strr = strr.replace("3.0", "3.7")
with open(
"trainset_preprocess_pipeline_print.py", "w", encoding="utf-8"
) as f:
f.write(strr)
else:
for config_file in ["32k.json", "40k.json", "48k.json"]:
with open(f"configs/{config_file}", "r") as f:
strr = f.read().replace("true", "false")
with open(f"configs/{config_file}", "w") as f:
f.write(strr)
with open("trainset_preprocess_pipeline_print.py", "r") as f:
strr = f.read().replace("3.7", "3.0")
with open("trainset_preprocess_pipeline_print.py", "w") as f:
data = json.load(f)
if "train" in data and "fp16_run" in data["train"]:
data["train"]["fp16_run"] = False
with open(f"configs/{config_file}", "w") as d:
json.dump(data, d, indent=4)
print(f"Set fp16_run to false in {config_file}")
with open(
"trainset_preprocess_pipeline_print.py", "r", encoding="utf-8"
) as f:
strr = f.read()
strr = strr.replace("3.7", "3.0")
with open(
"trainset_preprocess_pipeline_print.py", "w", encoding="utf-8"
) as f:
f.write(strr)
else:
print(
"CUDA is not available. Make sure you have an NVIDIA GPU and CUDA installed."
)
return (usefp16, device_capability)
class Config:
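The core of the fix: instead of string-replacing "true"/"false" in the config text (the old approach, whose removed lines appear interleaved with the new JSON-based ones above), use_fp32_config() now checks the GPU's CUDA compute capability and toggles fp16_run through the parsed JSON structure. Capability 7.x and up (Volta/Turing and newer) is treated as fp16-capable. A minimal standalone sketch of that logic, assuming the {"train": {"fp16_run": ...}} shape shown above; the function name and paths here are illustrative:

    import json
    import torch

    def detect_and_write_fp16(
        config_paths=("configs/32k.json", "configs/40k.json", "configs/48k.json"),
    ):
        # Compute-capability major version >= 7 (Volta/Turing and newer) -> fp16-capable.
        fp16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 7
        for path in config_paths:
            with open(path, "r") as f:
                data = json.load(f)
            if "train" in data and "fp16_run" in data["train"]:
                data["train"]["fp16_run"] = fp16_ok  # structured edit, not a text substitution
                with open(path, "w") as f:
                    json.dump(data, f, indent=4)
        return fp16_ok

The companion 3.0 -> 3.7 rewrite in trainset_preprocess_pipeline_print.py appears to bump a slice-length constant used on the fp16 path; the diff only shows the text substitution, not the constant's meaning.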
@@ -51,10 +105,14 @@ class Config:
help="Do not open in browser automatically",
)
parser.add_argument( # Fork Feature. Paperspace integration for web UI
"--paperspace", action="store_true", help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems."
"--paperspace",
action="store_true",
help="Note that this argument just shares a gradio link for the web UI. Thus can be used on other non-local CLI systems.",
)
parser.add_argument( # Fork Feature. Embed a CLI into the infer-web.py
"--is_cli", action="store_true", help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!"
"--is_cli",
action="store_true",
help="Use the CLI instead of setting up a gradio UI. This flag will launch an RVC text interface where you can execute functions from infer-web.py!",
)
cmd_opts = parser.parse_args()
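Both fork flags only change how the UI is exposed, not training itself. Illustrative invocations (the entry-point name comes from the --is_cli help text above):

    # python infer-web.py --paperspace   # share a public gradio link for remote/cloud hosts
    # python infer-web.py --is_cli       # text interface instead of the gradio UI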
@@ -95,9 +153,9 @@ class Config:
):
print("Found GPU", self.gpu_name, ", force to fp32")
self.is_half = False
use_fp32_config()
else:
print("Found GPU", self.gpu_name)
use_fp32_config()
self.gpu_mem = int(
torch.cuda.get_device_properties(i_device).total_memory
/ 1024
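The name-based branch (its condition sits above this hunk) forces is_half = False for GPUs whose fp16 throughput is known to be poor, and, new in this commit, both branches call use_fp32_config() so the JSON configs are refreshed to match whatever was detected. A hedged sketch of the branch's shape; the fork's actual substring list is not shown in this diff:

    # Illustrative tags only: the real name list lives outside this hunk.
    if any(tag in self.gpu_name for tag in ("16", "P40")):
        print("Found GPU", self.gpu_name, ", force to fp32")
        self.is_half = False
    else:
        print("Found GPU", self.gpu_name)
    use_fp32_config()  # rewrite fp16_run in the configs either way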


@@ -19,9 +19,9 @@ f = open("%s/extract_f0_feature.log" % exp_dir, "a+")
DoFormant = False
with open('formanting.txt', 'r') as fvf:
with open("formanting.txt", "r") as fvf:
content = fvf.readlines()
Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
def printt(strr):
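This file (and the preprocessing script further down) reads runtime formant settings from formanting.txt. The reads imply a three-line layout: an enable flag, then Quefrency, then Timbre. A sketch of writing and reading that file; the values are examples, only the layout is implied by the code:

    with open("formanting.txt", "w") as fvf:
        fvf.write("False\n8.0\n1.2\n")

    with open("formanting.txt", "r") as fvf:
        content = fvf.readlines()
    do_formant = "True" in content[0]
    quefrency, timbre = content[1].split("\n")[0], content[2].split("\n")[0]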
@@ -67,9 +67,9 @@ class FeatureInput(object):
):
# Get various f0 methods from input to use in the computation stack
s = methods_str
s = s.split('hybrid')[1]
s = s.replace('[', '').replace(']', '')
methods = s.split('+')
s = s.split("hybrid")[1]
s = s.replace("[", "").replace("]", "")
methods = s.split("+")
f0_computation_stack = []
print("Calculating f0 pitch estimations for methods: %s" % str(methods))
@@ -99,7 +99,9 @@ class FeatureInput(object):
torch_device_index = 0
torch_device = None
if torch.cuda.is_available():
torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
torch_device = torch.device(
f"cuda:{torch_device_index % torch.cuda.device_count()}"
)
elif torch.backends.mps.is_available():
torch_device = torch.device("mps")
else:
@@ -132,7 +134,9 @@ class FeatureInput(object):
torch_device_index = 0
torch_device = None
if torch.cuda.is_available():
torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
torch_device = torch.device(
f"cuda:{torch_device_index % torch.cuda.device_count()}"
)
elif torch.backends.mps.is_available():
torch_device = torch.device("mps")
else:
@@ -156,7 +160,7 @@ class FeatureInput(object):
"full",
batch_size=crepe_hop_length * 2,
device=torch_device,
pad=True
pad=True,
)
p_len = p_len or x.shape[0] // crepe_hop_length
# Resize the pitch
@@ -165,7 +169,7 @@ class FeatureInput(object):
target = np.interp(
np.arange(0, len(source) * p_len, len(source)) / p_len,
np.arange(0, len(source)),
source
source,
)
f0 = np.nan_to_num(target)
elif method == "harvest":
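The np.interp call above stretches the crepe f0 track to exactly p_len frames by linear interpolation; np.nan_to_num then zeroes the NaNs crepe leaves in unvoiced regions. The same recipe at a small scale:

    import numpy as np

    source = np.array([100.0, 110.0, np.nan, 130.0])  # 4 f0 frames, one unvoiced
    p_len = 8
    target = np.interp(
        np.arange(0, len(source) * p_len, len(source)) / p_len,  # 8 query points in [0, 4)
        np.arange(0, len(source)),                               # original frame indices
        source,
    )
    f0 = np.nan_to_num(target)  # NaN -> 0.0, marking unvoiced frames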
@@ -236,10 +240,9 @@ class FeatureInput(object):
elif f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from rmvpe import RMVPE
print("loading rmvpe model")
self.model_rmvpe = RMVPE(
"rmvpe.pt", is_half=False, device="cuda:0"
)
self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cuda:0")
f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
elif f0_method == "dio":
f0, t = pyworld.dio(
@@ -250,12 +253,16 @@ class FeatureInput(object):
frame_period=1000 * self.hop / self.fs,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs)
elif f0_method == "crepe": # Fork Feature: Added crepe f0 for f0 feature extraction
elif (
f0_method == "crepe"
): # Fork Feature: Added crepe f0 for f0 feature extraction
# Pick a batch size that doesn't cause memory errors on your gpu
torch_device_index = 0
torch_device = None
if torch.cuda.is_available():
torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
torch_device = torch.device(
f"cuda:{torch_device_index % torch.cuda.device_count()}"
)
elif torch.backends.mps.is_available():
torch_device = torch.device("mps")
else:
@@ -287,7 +294,9 @@ class FeatureInput(object):
torch_device_index = 0
torch_device = None
if torch.cuda.is_available():
torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
torch_device = torch.device(
f"cuda:{torch_device_index % torch.cuda.device_count()}"
)
elif torch.backends.mps.is_available():
torch_device = torch.device("mps")
else:
@@ -311,7 +320,7 @@ class FeatureInput(object):
"full",
batch_size=crepe_hop_length * 2,
device=torch_device,
pad=True
pad=True,
)
p_len = p_len or x.shape[0] // crepe_hop_length
# Resize the pitch
@@ -320,7 +329,7 @@ class FeatureInput(object):
target = np.interp(
np.arange(0, len(source) * p_len, len(source)) / p_len,
np.arange(0, len(source)),
source
source,
)
f0 = np.nan_to_num(target)
elif "hybrid" in f0_method: # EXPERIMENTAL
@@ -333,7 +342,7 @@ class FeatureInput(object):
self.f0_max,
p_len,
crepe_hop_length,
time_step
time_step,
)
# Mangio-RVC-Fork Feature: Add hybrid f0 inference to feature extraction. EXPERIMENTAL...
@@ -362,14 +371,19 @@ class FeatureInput(object):
with tqdm.tqdm(total=len(paths), leave=True, position=thread_n) as pbar:
for idx, (inp_path, opt_path1, opt_path2) in enumerate(paths):
try:
pbar.set_description("thread:%s, f0ing, Hop-Length:%s" % (thread_n, crepe_hop_length))
pbar.set_description(
"thread:%s, f0ing, Hop-Length:%s"
% (thread_n, crepe_hop_length)
)
pbar.update(1)
if (
os.path.exists(opt_path1 + ".npy") == True
and os.path.exists(opt_path2 + ".npy") == True
):
continue
featur_pit = self.compute_f0(inp_path, f0_method, crepe_hop_length)
featur_pit = self.compute_f0(
inp_path, f0_method, crepe_hop_length
)
np.save(
opt_path2,
featur_pit,
@@ -382,7 +396,9 @@ class FeatureInput(object):
allow_pickle=False,
) # ori
except:
printt("f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc()))
printt(
"f0fail-%s-%s-%s" % (idx, inp_path, traceback.format_exc())
)
if __name__ == "__main__":
@@ -411,12 +427,7 @@ if __name__ == "__main__":
for i in range(n_p):
p = Process(
target=featureInput.go,
args=(
paths[i::n_p],
f0method,
extraction_crepe_hop_length,
i
),
args=(paths[i::n_p], f0method, extraction_crepe_hop_length, i),
)
ps.append(p)
p.start()
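paths[i::n_p] deals the file list round-robin across the n_p worker processes, so the shards are disjoint and roughly equal without any extra bookkeeping:

    paths = ["a.wav", "b.wav", "c.wav", "d.wav", "e.wav"]
    n_p = 2
    shards = [paths[i::n_p] for i in range(n_p)]
    # shards == [['a.wav', 'c.wav', 'e.wav'], ['b.wav', 'd.wav']]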


@@ -51,8 +51,10 @@ class RVC:
self.window = 160
# Get Torch Device
if(torch.cuda.is_available()):
self.torch_device = torch.device(f"cuda:{0 % torch.cuda.device_count()}")
if torch.cuda.is_available():
self.torch_device = torch.device(
f"cuda:{0 % torch.cuda.device_count()}"
)
elif torch.backends.mps.is_available():
self.torch_device = torch.device("mps")
else:
@@ -150,11 +152,11 @@ class RVC:
f0 = 0
# Here, check f0_methods and get their computations
if(self.f0_method == 'harvest'):
if self.f0_method == "harvest":
f0 = self.get_harvest_computation(x, f0_min, f0_max)
elif(self.f0_method == 'reg-crepe'):
elif self.f0_method == "reg-crepe":
f0 = self.get_regular_crepe_computation(x, f0_min, f0_max)
elif(self.f0_method == 'reg-crepe-tiny'):
elif self.f0_method == "reg-crepe-tiny":
f0 = self.get_regular_crepe_computation(x, f0_min, f0_max, "tiny")
# Calculate f0_course and f0_bak here
@@ -328,11 +330,7 @@ class GUI:
[
sg.Frame(
title="Proudly forked by Mangio621",
layout=[
[
sg.Image('./mangio_utils/lol.png')
]
]
layout=[[sg.Image("./mangio_utils/lol.png")]],
),
sg.Frame(
title=i18n("加载模型"),
@@ -384,14 +382,16 @@ class GUI:
),
],
],
)
),
],
[
# Mangio f0 Selection frame Here
sg.Frame(
layout=[
[
sg.Radio("Harvest", "f0_method", key="harvest", default=True),
sg.Radio(
"Harvest", "f0_method", key="harvest", default=True
),
sg.Radio("Crepe", "f0_method", key="reg-crepe"),
sg.Radio("Crepe Tiny", "f0_method", key="reg-crepe-tiny"),
]
@@ -539,17 +539,18 @@ class GUI:
# Function that returns the used f0 method in string format "harvest"
def get_f0_method_from_radios(self, values):
f0_array = [
{"name": "harvest", "val": values['harvest']},
{"name": "reg-crepe", "val": values['reg-crepe']},
{"name": "reg-crepe-tiny", "val": values['reg-crepe-tiny']},
{"name": "harvest", "val": values["harvest"]},
{"name": "reg-crepe", "val": values["reg-crepe"]},
{"name": "reg-crepe-tiny", "val": values["reg-crepe-tiny"]},
]
# Filter through to find a true value
used_f0 = ""
for f0 in f0_array:
if(f0['val'] == True):
used_f0 = f0['name']
if f0["val"] == True:
used_f0 = f0["name"]
break
if(used_f0 == ""): used_f0 = "harvest" # Default Harvest if used_f0 is empty somehow
if used_f0 == "":
used_f0 = "harvest" # Default Harvest if used_f0 is empty somehow
return used_f0
def set_values(self, values):
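The radio scan above walks a list of {name, val} dicts and falls back to "harvest". An equivalent, more compact lookup over the same keys; a sketch, not the fork's code:

    used_f0 = next(
        (name for name in ("harvest", "reg-crepe", "reg-crepe-tiny") if values.get(name)),
        "harvest",  # default when no radio is somehow selected
    )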

File diff suppressed because it is too large.


@@ -1,11 +1,14 @@
import ffmpeg
import numpy as np
#import praatio
#import praatio.praat_scripts
import os
#from os.path import join
#praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
# import praatio
# import praatio.praat_scripts
import os
# from os.path import join
# praatEXE = join('.',os.path.abspath(os.getcwd()) + r"\Praat.exe")
def load_audio(file, sr, DoFormant, Quefrency, Timbre):
try:
@@ -15,43 +18,47 @@ def load_audio(file, sr, DoFormant, Quefrency, Timbre):
file = (
file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
) # 防止小白拷路径头尾带了空格和"和回车
file_formanted = (
file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
)
with open('formanting.txt', 'r') as fvf:
file_formanted = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
with open("formanting.txt", "r") as fvf:
content = fvf.readlines()
if 'True' in content[0].split('\n')[0]:
#print("true")
if "True" in content[0].split("\n")[0]:
# print("true")
DoFormant = True
Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
else:
#print("not true")
# print("not true")
DoFormant = False
if DoFormant:
#os.system(f"stftpitchshift -i {file} -q {Quefrency} -t {Timbre} -o {file_formanted}")
#print('stftpitchshift -i "%s" -p 1.0 --rms -w 128 -v 8 -q %s -t %s -o "%s"' % (file, Quefrency, Timbre, file_formanted))
# os.system(f"stftpitchshift -i {file} -q {Quefrency} -t {Timbre} -o {file_formanted}")
# print('stftpitchshift -i "%s" -p 1.0 --rms -w 128 -v 8 -q %s -t %s -o "%s"' % (file, Quefrency, Timbre, file_formanted))
print("formanting...")
os.system('stftpitchshift -i "%s" -q %s -t %s -o "%sFORMANTED"' % (file, Quefrency, Timbre, file_formanted))
os.system(
'stftpitchshift -i "%s" -q %s -t %s -o "%sFORMANTED"'
% (file, Quefrency, Timbre, file_formanted)
)
print("formanted!")
#filepraat = (os.path.abspath(os.getcwd()) + '\\' + file).replace('/','\\')
#file_formantedpraat = ('"' + os.path.abspath(os.getcwd()) + '/' + 'formanted'.join(file_formanted) + '"').replace('/','\\')
# filepraat = (os.path.abspath(os.getcwd()) + '\\' + file).replace('/','\\')
# file_formantedpraat = ('"' + os.path.abspath(os.getcwd()) + '/' + 'formanted'.join(file_formanted) + '"').replace('/','\\')
out, _ = (
ffmpeg.input('%sFORMANTED%s' % (file_formanted, '.wav'), threads=0)
ffmpeg.input("%sFORMANTED%s" % (file_formanted, ".wav"), threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
.run(
cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
)
)
os.remove('%sFORMANTED%s' % (file_formanted, '.wav'))
os.remove("%sFORMANTED%s" % (file_formanted, ".wav"))
else:
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
.run(
cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
)
)
except Exception as e:
raise RuntimeError(f"Failed to load audio: {e}")
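load_audio's pipeline: scrub quotes and whitespace from a pasted path, optionally formant-shift through the external stftpitchshift CLI (which writes <file>FORMANTED.wav), then have ffmpeg decode to raw 32-bit-float mono PCM at the target rate on stdout. The bytes in out still need to become samples; presumably the function ends with the standard conversion for this ffmpeg recipe, which is outside the shown hunk:

    import numpy as np

    # f32le output: 4 bytes per sample, mono, already resampled to sr.
    audio = np.frombuffer(out, np.float32).flatten()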


@@ -571,7 +571,7 @@ def train_and_evaluate(
with open("stop.txt", "r+") as tostop:
content = tostop.read()
if 'stop' in content:
if "stop" in content:
logger.info("Stop Button was pressed. The program is closed.")
if hasattr(net_g, "module"):
ckpt = net_g.module.state_dict()
@@ -581,7 +581,13 @@ def train_and_evaluate(
"saving final ckpt:%s"
% (
savee(
ckpt, hps.sample_rate, hps.if_f0, hps.name, epoch, hps.version, hps
ckpt,
hps.sample_rate,
hps.if_f0,
hps.name,
epoch,
hps.version,
hps,
)
)
)
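The trainer polls a stop.txt sentinel each loop and, when it reads "stop", saves a final checkpoint before exiting; hasattr(net_g, "module") picks the underlying model's state_dict when the network is wrapped (e.g. by DistributedDataParallel). The polling half in isolation, with illustrative names:

    def should_stop(path="stop.txt"):
        # Cooperative shutdown: the UI writes "stop" into the file.
        try:
            with open(path, "r") as f:
                return "stop" in f.read()
        except FileNotFoundError:
            return False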


@@ -24,9 +24,10 @@ Timbre = 0.0
mutex = multiprocessing.Lock()
f = open("%s/preprocess.log" % exp_dir, "a+")
with open('formanting.txt', 'r') as fvf:
with open("formanting.txt", "r") as fvf:
content = fvf.readlines()
Quefrency, Timbre = content[1].split('\n')[0], content[2].split('\n')[0]
Quefrency, Timbre = content[1].split("\n")[0], content[2].split("\n")[0]
def println(strr):
mutex.acquire()
@@ -104,12 +105,14 @@ class PreProcess:
idx1 += 1
break
self.norm_write(tmp_audio, idx0, idx1)
#println("%s->Suc." % path)
# println("%s->Suc." % path)
except:
println("%s->%s" % (path, traceback.format_exc()))
def pipeline_mp(self, infos, thread_n):
for path, idx0 in tqdm.tqdm(infos, position=thread_n, leave=True, desc="thread:%s" % thread_n):
for path, idx0 in tqdm.tqdm(
infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
):
self.pipeline(path, idx0)
def pipeline_mp_inp_dir(self, inp_root, n_p):


@@ -15,6 +15,7 @@ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
input_audio_path2wav = {}
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
audio = input_audio_path2wav[input_audio_path]
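@lru_cache cannot key on a numpy array, so the wav is parked in the module-level input_audio_path2wav dict and only hashable scalars (path string, rates, thresholds) reach the cache; repeated conversions of the same file then skip the slow harvest pass entirely. Usage shape, with illustrative values:

    input_audio_path2wav["/tmp/a.wav"] = audio_np                 # caller registers the audio first
    f0 = cache_harvest_f0("/tmp/a.wav", 16000, 1100.0, 50.0, 10)  # computed once
    f0 = cache_harvest_f0("/tmp/a.wav", 16000, 1100.0, 50.0, 10)  # identical args -> cache hit

Note that the bare @lru_cache decorator (no parentheses) requires Python 3.8 or newer.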
@@ -74,7 +75,9 @@ class VC(object):
def get_optimal_torch_device(self, index: int = 0) -> torch.device:
# Get cuda device
if torch.cuda.is_available():
return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast
return torch.device(
f"cuda:{index % torch.cuda.device_count()}"
) # Very fast
elif torch.backends.mps.is_available():
return torch.device("mps")
# Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library
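Selection order is CUDA, then Apple's MPS, then an implicit CPU fallback; the modulo keeps any requested index inside the visible device range. The same logic as a standalone sketch:

    import torch

    def pick_device(index: int = 0) -> torch.device:
        if torch.cuda.is_available():
            # 5 % 2 == 1 on a 2-GPU box, so oversized indices never raise.
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")
        if torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")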
@@ -91,7 +94,9 @@ class VC(object):
hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time.
model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
):
x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
x = x.astype(
np.float32
) # fixes the F.conv2D exception. We needed to convert double to float.
x /= np.quantile(np.abs(x), 0.999)
torch_device = self.get_optimal_torch_device()
audio = torch.from_numpy(x).to(torch_device, copy=True)
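Two preparation steps worth noting: the float32 cast works around the F.conv2d dtype error the comment mentions, and dividing by the 99.9th-percentile absolute amplitude (rather than the max) normalizes robustly, so a single clipped spike cannot crush the whole signal:

    import numpy as np

    x = np.sin(np.arange(10_000, dtype=np.float32))  # ordinary signal in [-1, 1]
    x[0] = 50.0                                      # one clipped spike
    x /= np.quantile(np.abs(x), 0.999)               # divides by ~1.0, not by 50.0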
@@ -109,7 +114,7 @@ class VC(object):
model,
batch_size=hop_length * 2,
device=torch_device,
pad=True
pad=True,
)
p_len = p_len or x.shape[0] // hop_length
# Resize the pitch for final f0
@@ -118,7 +123,7 @@ class VC(object):
target = np.interp(
np.arange(0, len(source) * p_len, len(source)) / p_len,
np.arange(0, len(source)),
source
source,
)
f0 = np.nan_to_num(target)
return f0 # Resized f0
@@ -153,7 +158,7 @@ class VC(object):
# Fork Feature: Compute pYIN f0 method
def get_f0_pyin_computation(self, x, f0_min, f0_max):
y, sr = librosa.load('saudio/Sidney.wav', self.sr, mono=True)
y, sr = librosa.load("saudio/Sidney.wav", self.sr, mono=True)
f0, _, _ = librosa.pyin(y, sr=self.sr, fmin=f0_min, fmax=f0_max)
f0 = f0[1:] # Get rid of extra first frame
return f0
@@ -173,9 +178,9 @@ class VC(object):
):
# Get various f0 methods from input to use in the computation stack
s = methods_str
s = s.split('hybrid')[1]
s = s.replace('[', '').replace(']', '')
methods = s.split('+')
s = s.split("hybrid")[1]
s = s.replace("[", "").replace("]", "")
methods = s.split("+")
f0_computation_stack = []
print("Calculating f0 pitch estimations for methods: %s" % str(methods))
@@ -207,9 +212,13 @@ class VC(object):
f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
f0 = f0[1:] # Get rid of extra first frame
elif method == "mangio-crepe":
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
f0 = self.get_f0_crepe_computation(
x, f0_min, f0_max, p_len, crepe_hop_length
)
elif method == "mangio-crepe-tiny":
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
f0 = self.get_f0_crepe_computation(
x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
)
elif method == "harvest":
f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
if filter_radius > 2:
@@ -221,12 +230,12 @@ class VC(object):
fs=self.sr,
f0_ceil=f0_max,
f0_floor=f0_min,
frame_period=10
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
f0 = f0[1:]
#elif method == "pyin": Not Working just yet
# elif method == "pyin": Not Working just yet
# f0 = self.get_f0_pyin_computation(x, f0_min, f0_max)
# Push method to the stack
f0_computation_stack.append(f0)
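Each selected method contributes one f0 track to f0_computation_stack; the blend itself happens outside this hunk. A hedged sketch of the usual combination (whether the fork takes a median or a mean here is not shown in this diff; a nan-aware reducer keeps frames where only some methods found a pitch):

    import numpy as np

    # Truncate to a common length, then reduce across methods per frame.
    min_len = min(len(f0) for f0 in f0_computation_stack)
    stacked = np.vstack([f0[:min_len] for f0 in f0_computation_stack])
    f0_hybrid = np.nanmedian(stacked, axis=0)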
@@ -286,7 +295,7 @@ class VC(object):
fs=self.sr,
f0_ceil=f0_max,
f0_floor=f0_min,
frame_period=10
frame_period=10,
)
f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
@@ -295,12 +304,17 @@ class VC(object):
elif f0_method == "crepe-tiny":
f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "tiny")
elif f0_method == "mangio-crepe":
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length)
f0 = self.get_f0_crepe_computation(
x, f0_min, f0_max, p_len, crepe_hop_length
)
elif f0_method == "mangio-crepe-tiny":
f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, crepe_hop_length, "tiny")
f0 = self.get_f0_crepe_computation(
x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
)
elif f0_method == "rmvpe":
if hasattr(self, "model_rmvpe") == False:
from rmvpe import RMVPE
print("loading rmvpe model")
self.model_rmvpe = RMVPE(
"rmvpe.pt", is_half=self.is_half, device=self.device
@@ -319,7 +333,7 @@ class VC(object):
p_len,
filter_radius,
crepe_hop_length,
time_step
time_step,
)
f0 *= pow(2, f0_up_key / 12)
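The closing transpose is the standard equal-temperament ratio: every f0 value is multiplied by 2^(f0_up_key/12), so +12 semitones doubles the pitch and -12 halves it:

    f0_up_key = 12
    ratio = pow(2, f0_up_key / 12)  # 2.0
    # 220.0 Hz (A3) * ratio -> 440.0 Hz (A4)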