Reformat and rewrite _get_name_params (#57)

* Reformat

* rewrite _get_name_params

* Add workflow for automatic formatting

* Revert "Add workflow for automatic formatting"

This reverts commit 9111c5dbc1830248305fb075587a88be07ad3115.

* revert Retrieval_based_Voice_Conversion_WebUI.ipynb

---------

Co-authored-by: 源文雨 <41315874+fumiama@users.noreply.github.com>
This commit is contained in:
Ftps
2023-04-15 20:44:24 +09:00
committed by GitHub
parent aaa893c4b1
commit c8261b2ccc
45 changed files with 4878 additions and 2456 deletions

View File

@@ -10,7 +10,6 @@ from uvr5_pack.lib_v5 import spec_utils
class VocalRemoverValidationSet(torch.utils.data.Dataset):
def __init__(self, patch_list):
self.patch_list = patch_list
@@ -21,7 +20,7 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
path = self.patch_list[idx]
data = np.load(path)
X, y = data['X'], data['y']
X, y = data["X"], data["y"]
X_mag = np.abs(X)
y_mag = np.abs(y)
@@ -30,16 +29,22 @@ class VocalRemoverValidationSet(torch.utils.data.Dataset):
def make_pair(mix_dir, inst_dir):
input_exts = ['.wav', '.m4a', '.mp3', '.mp4', '.flac']
input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
X_list = sorted([
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts])
y_list = sorted([
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts])
X_list = sorted(
[
os.path.join(mix_dir, fname)
for fname in os.listdir(mix_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
y_list = sorted(
[
os.path.join(inst_dir, fname)
for fname in os.listdir(inst_dir)
if os.path.splitext(fname)[1] in input_exts
]
)
filelist = list(zip(X_list, y_list))
@@ -47,10 +52,11 @@ def make_pair(mix_dir, inst_dir):
def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
if split_mode == 'random':
if split_mode == "random":
filelist = make_pair(
os.path.join(dataset_dir, 'mixtures'),
os.path.join(dataset_dir, 'instruments'))
os.path.join(dataset_dir, "mixtures"),
os.path.join(dataset_dir, "instruments"),
)
random.shuffle(filelist)
@@ -60,19 +66,23 @@ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
val_filelist = filelist[-val_size:]
else:
train_filelist = [
pair for pair in filelist
if list(pair) not in val_filelist]
elif split_mode == 'subdirs':
pair for pair in filelist if list(pair) not in val_filelist
]
elif split_mode == "subdirs":
if len(val_filelist) != 0:
raise ValueError('The `val_filelist` option is not available in `subdirs` mode')
raise ValueError(
"The `val_filelist` option is not available in `subdirs` mode"
)
train_filelist = make_pair(
os.path.join(dataset_dir, 'training/mixtures'),
os.path.join(dataset_dir, 'training/instruments'))
os.path.join(dataset_dir, "training/mixtures"),
os.path.join(dataset_dir, "training/instruments"),
)
val_filelist = make_pair(
os.path.join(dataset_dir, 'validation/mixtures'),
os.path.join(dataset_dir, 'validation/instruments'))
os.path.join(dataset_dir, "validation/mixtures"),
os.path.join(dataset_dir, "validation/instruments"),
)
return train_filelist, val_filelist
@@ -81,7 +91,9 @@ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
perm = np.random.permutation(len(X))
for i, idx in enumerate(tqdm(perm)):
if np.random.uniform() < reduction_rate:
y[idx] = spec_utils.reduce_vocal_aggressively(X[idx], y[idx], reduction_mask)
y[idx] = spec_utils.reduce_vocal_aggressively(
X[idx], y[idx], reduction_mask
)
if np.random.uniform() < 0.5:
# swap channel
@@ -116,10 +128,8 @@ def make_padding(width, cropsize, offset):
def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
len_dataset = patches * len(filelist)
X_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros(
(len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
@@ -127,22 +137,24 @@ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
ends = starts + cropsize
for j in range(patches):
idx = i * patches + j
X_dataset[idx] = X_pad[:, :, starts[j]:ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j]:ends[j]]
X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
return X_dataset, y_dataset
def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
patch_list = []
patch_dir = 'cs{}_sr{}_hl{}_nf{}_of{}'.format(cropsize, sr, hop_length, n_fft, offset)
patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
cropsize, sr, hop_length, n_fft, offset
)
os.makedirs(patch_dir, exist_ok=True)
for i, (X_path, y_path) in enumerate(tqdm(filelist)):
@@ -153,18 +165,19 @@ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
X, y = X / coef, y / coef
l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode='constant')
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode='constant')
X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
len_dataset = int(np.ceil(X.shape[2] / roi_size))
for j in range(len_dataset):
outpath = os.path.join(patch_dir, '{}_p{}.npz'.format(basename, j))
outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
start = j * roi_size
if not os.path.exists(outpath):
np.savez(
outpath,
X=X_pad[:, :, start:start + cropsize],
y=y_pad[:, :, start:start + cropsize])
X=X_pad[:, :, start : start + cropsize],
y=y_pad[:, :, start : start + cropsize],
)
patch_list.append(outpath)
return VocalRemoverValidationSet(patch_list)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,28 +84,31 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -6,19 +6,20 @@ from uvr5_pack.lib_v5 import spec_utils
class Conv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(Conv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nout,
nin,
nout,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
bias=False),
bias=False,
),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -26,24 +27,22 @@ class Conv2DBNActiv(nn.Module):
class SeperableConv2DBNActiv(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
super(SeperableConv2DBNActiv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(
nin, nin,
nin,
nin,
kernel_size=ksize,
stride=stride,
padding=pad,
dilation=dilation,
groups=nin,
bias=False),
nn.Conv2d(
nin, nout,
kernel_size=1,
bias=False),
bias=False,
),
nn.Conv2d(nin, nout, kernel_size=1, bias=False),
nn.BatchNorm2d(nout),
activ()
activ(),
)
def __call__(self, x):
@@ -51,7 +50,6 @@ class SeperableConv2DBNActiv(nn.Module):
class Encoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
super(Encoder, self).__init__()
self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
@@ -65,14 +63,15 @@ class Encoder(nn.Module):
class Decoder(nn.Module):
def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False):
def __init__(
self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
):
super(Decoder, self).__init__()
self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
self.dropout = nn.Dropout2d(0.1) if dropout else None
def __call__(self, x, skip=None):
x = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=True)
x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
if skip is not None:
skip = spec_utils.crop_center(skip, x)
x = torch.cat([x, skip], dim=1)
@@ -85,32 +84,37 @@ class Decoder(nn.Module):
class ASPPModule(nn.Module):
def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
super(ASPPModule, self).__init__()
self.conv1 = nn.Sequential(
nn.AdaptiveAvgPool2d((1, None)),
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
)
self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
self.conv3 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ)
nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
)
self.conv4 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ)
nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
)
self.conv5 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv6 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.conv7 = SeperableConv2DBNActiv(
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ)
nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
)
self.bottleneck = nn.Sequential(
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ),
nn.Dropout2d(0.1)
Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
)
def forward(self, x):
_, _, h, w = x.size()
feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
feat1 = F.interpolate(
self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
)
feat2 = self.conv2(x)
feat3 = self.conv3(x)
feat4 = self.conv4(x)

View File

@@ -3,33 +3,33 @@ import os
import pathlib
default_param = {}
default_param['bins'] = 768
default_param['unstable_bins'] = 9 # training only
default_param['reduction_bins'] = 762 # training only
default_param['sr'] = 44100
default_param['pre_filter_start'] = 757
default_param['pre_filter_stop'] = 768
default_param['band'] = {}
default_param["bins"] = 768
default_param["unstable_bins"] = 9 # training only
default_param["reduction_bins"] = 762 # training only
default_param["sr"] = 44100
default_param["pre_filter_start"] = 757
default_param["pre_filter_stop"] = 768
default_param["band"] = {}
default_param['band'][1] = {
'sr': 11025,
'hl': 128,
'n_fft': 960,
'crop_start': 0,
'crop_stop': 245,
'lpf_start': 61, # inference only
'res_type': 'polyphase'
default_param["band"][1] = {
"sr": 11025,
"hl": 128,
"n_fft": 960,
"crop_start": 0,
"crop_stop": 245,
"lpf_start": 61, # inference only
"res_type": "polyphase",
}
default_param['band'][2] = {
'sr': 44100,
'hl': 512,
'n_fft': 1536,
'crop_start': 24,
'crop_stop': 547,
'hpf_start': 81, # inference only
'res_type': 'sinc_best'
default_param["band"][2] = {
"sr": 44100,
"hl": 512,
"n_fft": 1536,
"crop_start": 24,
"crop_stop": 547,
"hpf_start": 81, # inference only
"res_type": "sinc_best",
}
@@ -40,21 +40,30 @@ def int_keys(d):
k = int(k)
r[k] = v
return r
class ModelParameters(object):
def __init__(self, config_path=''):
if '.pth' == pathlib.Path(config_path).suffix:
def __init__(self, config_path=""):
if ".pth" == pathlib.Path(config_path).suffix:
import zipfile
with zipfile.ZipFile(config_path, 'r') as zip:
self.param = json.loads(zip.read('param.json'), object_pairs_hook=int_keys)
elif '.json' == pathlib.Path(config_path).suffix:
with open(config_path, 'r') as f:
with zipfile.ZipFile(config_path, "r") as zip:
self.param = json.loads(
zip.read("param.json"), object_pairs_hook=int_keys
)
elif ".json" == pathlib.Path(config_path).suffix:
with open(config_path, "r") as f:
self.param = json.loads(f.read(), object_pairs_hook=int_keys)
else:
self.param = default_param
for k in ['mid_side', 'mid_side_b', 'mid_side_b2', 'stereo_w', 'stereo_n', 'reverse']:
for k in [
"mid_side",
"mid_side_b",
"mid_side_b2",
"stereo_w",
"stereo_n",
"reverse",
]:
if not k in self.param:
self.param[k] = False
self.param[k] = False

View File

@@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import spec_utils
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
@@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_33966KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 16)
@@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
@@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -7,7 +7,6 @@ from uvr5_pack.lib_v5 import layers_537238KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -39,7 +38,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 64)
@@ -64,13 +62,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -82,24 +83,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -107,7 +117,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -6,7 +6,6 @@ from uvr5_pack.lib_v5 import layers_123821KB as layers
class BaseASPPNet(nn.Module):
def __init__(self, nin, ch, dilations=(4, 8, 16)):
super(BaseASPPNet, self).__init__()
self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
@@ -38,7 +37,6 @@ class BaseASPPNet(nn.Module):
class CascadedASPPNet(nn.Module):
def __init__(self, n_fft):
super(CascadedASPPNet, self).__init__()
self.stg1_low_band_net = BaseASPPNet(2, 32)
@@ -63,13 +61,16 @@ class CascadedASPPNet(nn.Module):
mix = x.detach()
x = x.clone()
x = x[:, :, :self.max_bin]
x = x[:, :, : self.max_bin]
bandw = x.size()[2] // 2
aux1 = torch.cat([
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:])
], dim=2)
aux1 = torch.cat(
[
self.stg1_low_band_net(x[:, :, :bandw]),
self.stg1_high_band_net(x[:, :, bandw:]),
],
dim=2,
)
h = torch.cat([x, aux1], dim=1)
aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
@@ -81,24 +82,33 @@ class CascadedASPPNet(nn.Module):
mask = F.pad(
input=mask,
pad=(0, 0, 0, self.output_bin - mask.size()[2]),
mode='replicate')
mode="replicate",
)
if self.training:
aux1 = torch.sigmoid(self.aux1_out(aux1))
aux1 = F.pad(
input=aux1,
pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
mode='replicate')
mode="replicate",
)
aux2 = torch.sigmoid(self.aux2_out(aux2))
aux2 = F.pad(
input=aux2,
pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
mode='replicate')
mode="replicate",
)
return mask * mix, aux1 * mix, aux2 * mix
else:
if aggressiveness:
mask[:, :, :aggressiveness['split_bin']] = torch.pow(mask[:, :, :aggressiveness['split_bin']], 1 + aggressiveness['value'] / 3)
mask[:, :, aggressiveness['split_bin']:] = torch.pow(mask[:, :, aggressiveness['split_bin']:], 1 + aggressiveness['value'])
mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
mask[:, :, : aggressiveness["split_bin"]],
1 + aggressiveness["value"] / 3,
)
mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
mask[:, :, aggressiveness["split_bin"] :],
1 + aggressiveness["value"],
)
return mask * mix
@@ -106,7 +116,7 @@ class CascadedASPPNet(nn.Module):
h = self.forward(x_mag, aggressiveness)
if self.offset > 0:
h = h[:, :, :, self.offset:-self.offset]
h = h[:, :, :, self.offset : -self.offset]
assert h.size()[3] > 0
return h

View File

@@ -1,8 +1,9 @@
import os,librosa
import numpy as np
import soundfile as sf
import os, librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
import json,math ,hashlib
import json, math, hashlib
def crop_center(h1, h2):
h1_shape = h1.size()
@@ -11,7 +12,7 @@ def crop_center(h1, h2):
if h1_shape[3] == h2_shape[3]:
return h1
elif h1_shape[3] < h2_shape[3]:
raise ValueError('h1_shape[3] must be greater than h2_shape[3]')
raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
# s_freq = (h2_shape[2] - h1_shape[2]) // 2
# e_freq = s_freq + h1_shape[2]
@@ -22,7 +23,9 @@ def crop_center(h1, h2):
return h1
def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
def wave_to_spectrogram(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
if reverse:
wave_left = np.flip(np.asfortranarray(wave[0]))
wave_right = np.flip(np.asfortranarray(wave[1]))
@@ -30,21 +33,23 @@ def wave_to_spectrogram(wave, hop_length, n_fft, mid_side=False, mid_side_b2=Fal
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
spec = np.asfortranarray([spec_left, spec_right])
return spec
def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False):
def wave_to_spectrogram_mt(
wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
):
import threading
if reverse:
@@ -54,62 +59,75 @@ def wave_to_spectrogram_mt(wave, hop_length, n_fft, mid_side=False, mid_side_b2=
wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
elif mid_side_b2:
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * .5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * .5))
wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
else:
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
def run_thread(**kwargs):
global spec_left
spec_left = librosa.stft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'y': wave_left, 'n_fft': n_fft, 'hop_length': hop_length})
thread = threading.Thread(
target=run_thread,
kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
)
thread.start()
spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
thread.join()
thread.join()
spec = np.asfortranarray([spec_left, spec_right])
return spec
def combine_spectrograms(specs, mp):
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param['bins'] + 1, l), dtype=np.complex64)
l = min([specs[i].shape[2] for i in specs])
spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
offset = 0
bands_n = len(mp.param['band'])
bands_n = len(mp.param["band"])
for d in range(1, bands_n + 1):
h = mp.param['band'][d]['crop_stop'] - mp.param['band'][d]['crop_start']
spec_c[:, offset:offset+h, :l] = specs[d][:, mp.param['band'][d]['crop_start']:mp.param['band'][d]['crop_stop'], :l]
h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
spec_c[:, offset : offset + h, :l] = specs[d][
:, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
]
offset += h
if offset > mp.param['bins']:
raise ValueError('Too much bins')
if offset > mp.param["bins"]:
raise ValueError("Too much bins")
# lowpass fiter
if mp.param['pre_filter_start'] > 0: # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if (
mp.param["pre_filter_start"] > 0
): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
if bands_n == 1:
spec_c = fft_lp_filter(spec_c, mp.param['pre_filter_start'], mp.param['pre_filter_stop'])
spec_c = fft_lp_filter(
spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
)
else:
gp = 1
for b in range(mp.param['pre_filter_start'] + 1, mp.param['pre_filter_stop']):
g = math.pow(10, -(b - mp.param['pre_filter_start']) * (3.5 - gp) / 20.0)
gp = 1
for b in range(
mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
):
g = math.pow(
10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
)
gp = g
spec_c[:, b, :] *= g
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode='magnitude'):
if mode == 'magnitude':
return np.asfortranarray(spec_c)
def spectrogram_to_image(spec, mode="magnitude"):
if mode == "magnitude":
if np.iscomplexobj(spec):
y = np.abs(spec)
else:
y = spec
y = np.log10(y ** 2 + 1e-8)
elif mode == 'phase':
y = np.log10(y**2 + 1e-8)
elif mode == "phase":
if np.iscomplexobj(spec):
y = np.angle(spec)
else:
@@ -121,9 +139,7 @@ def spectrogram_to_image(spec, mode='magnitude'):
if y.ndim == 3:
img = img.transpose(1, 2, 0)
img = np.concatenate([
np.max(img, axis=2, keepdims=True), img
], axis=2)
img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
return img
@@ -136,12 +152,12 @@ def reduce_vocal_aggressively(X, y, softmask):
v_mask = v_mag_tmp > y_mag_tmp
y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
return y_mag * np.exp(1.j * np.angle(y))
return y_mag * np.exp(1.0j * np.angle(y))
def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if min_range < fade_size * 2:
raise ValueError('min_range must be >= fade_area * 2')
raise ValueError("min_range must be >= fade_area * 2")
mag = mag.copy()
@@ -159,72 +175,106 @@ def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
if s != 0:
weight = np.linspace(0, 1, fade_size)
mag[:, :, s:s + fade_size] += weight * ref[:, :, s:s + fade_size]
mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
else:
s -= fade_size
if e != mag.shape[2]:
weight = np.linspace(1, 0, fade_size)
mag[:, :, e - fade_size:e] += weight * ref[:, :, e - fade_size:e]
mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
else:
e += fade_size
mag[:, :, s + fade_size:e - fade_size] += ref[:, :, s + fade_size:e - fade_size]
mag[:, :, s + fade_size : e - fade_size] += ref[
:, :, s + fade_size : e - fade_size
]
old_e = e
return mag
def align_wave_head_and_tail(a, b):
l = min([a[0].size, b[0].size])
return a[:l,:l], b[:l,:l]
l = min([a[0].size, b[0].size])
return a[:l, :l], b[:l, :l]
def cache_or_load(mix_path, inst_path, mp):
mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
cache_dir = 'mph{}'.format(hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode('utf-8')).hexdigest())
mix_cache_dir = os.path.join('cache', cache_dir)
inst_cache_dir = os.path.join('cache', cache_dir)
cache_dir = "mph{}".format(
hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
)
mix_cache_dir = os.path.join("cache", cache_dir)
inst_cache_dir = os.path.join("cache", cache_dir)
os.makedirs(mix_cache_dir, exist_ok=True)
os.makedirs(inst_cache_dir, exist_ok=True)
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + '.npy')
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + '.npy')
mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
X_spec_m = np.load(mix_cache_path)
y_spec_m = np.load(inst_cache_path)
else:
X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param["band"]): # high-end band
X_wave[d], _ = librosa.load(
mix_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
)
y_wave[d], _ = librosa.load(
inst_path, bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
else: # lower bands
X_wave[d] = librosa.resample(X_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
y_wave[d] = librosa.resample(y_wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
inst_path,
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
else: # lower bands
X_wave[d] = librosa.resample(
X_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
y_wave[d] = librosa.resample(
y_wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
X_spec_s[d] = wave_to_spectrogram(X_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
y_spec_s[d] = wave_to_spectrogram(y_wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
X_spec_s[d] = wave_to_spectrogram(
X_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
y_spec_s[d] = wave_to_spectrogram(
y_wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
del X_wave, y_wave
X_spec_m = combine_spectrograms(X_spec_s, mp)
y_spec_m = combine_spectrograms(y_spec_s, mp)
if X_spec_m.shape != y_spec_m.shape:
raise ValueError('The combined spectrograms are different: ' + mix_path)
raise ValueError("The combined spectrograms are different: " + mix_path)
_, ext = os.path.splitext(mix_path)
@@ -244,72 +294,129 @@ def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
import threading
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
def run_thread(**kwargs):
global wave_left
wave_left = librosa.istft(**kwargs)
thread = threading.Thread(target=run_thread, kwargs={'stft_matrix': spec_left, 'hop_length': hop_length})
thread = threading.Thread(
target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
)
thread.start()
wave_right = librosa.istft(spec_right, hop_length=hop_length)
thread.join()
thread.join()
if reverse:
return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
elif mid_side:
return np.asfortranarray([np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)])
return np.asfortranarray(
[np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
)
elif mid_side_b2:
return np.asfortranarray([np.add(wave_right / 1.25, .4 * wave_left), np.subtract(wave_left / 1.25, .4 * wave_right)])
return np.asfortranarray(
[
np.add(wave_right / 1.25, 0.4 * wave_left),
np.subtract(wave_left / 1.25, 0.4 * wave_right),
]
)
else:
return np.asfortranarray([wave_left, wave_right])
def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
wave_band = {}
bands_n = len(mp.param['band'])
bands_n = len(mp.param["band"])
offset = 0
for d in range(1, bands_n + 1):
bp = mp.param['band'][d]
spec_s = np.ndarray(shape=(2, bp['n_fft'] // 2 + 1, spec_m.shape[2]), dtype=complex)
h = bp['crop_stop'] - bp['crop_start']
spec_s[:, bp['crop_start']:bp['crop_stop'], :] = spec_m[:, offset:offset+h, :]
bp = mp.param["band"][d]
spec_s = np.ndarray(
shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
)
h = bp["crop_stop"] - bp["crop_start"]
spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
:, offset : offset + h, :
]
offset += h
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp['n_fft'] // 2
spec_s[:, max_bin-extra_bins_h:max_bin, :] = extra_bins[:, :extra_bins_h, :]
if bp['hpf_start'] > 0:
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
if d == bands_n: # higher
if extra_bins_h: # if --high_end_process bypass
max_bin = bp["n_fft"] // 2
spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
:, :extra_bins_h, :
]
if bp["hpf_start"] > 0:
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
if bands_n == 1:
wave = spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
wave = spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
else:
wave = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
wave = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
else:
sr = mp.param['band'][d+1]['sr']
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave = librosa.resample(spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']), bp['sr'], sr, res_type="sinc_fastest")
else: # mid
spec_s = fft_hp_filter(spec_s, bp['hpf_start'], bp['hpf_stop'] - 1)
spec_s = fft_lp_filter(spec_s, bp['lpf_start'], bp['lpf_stop'])
wave2 = np.add(wave, spectrogram_to_wave(spec_s, bp['hl'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse']))
sr = mp.param["band"][d + 1]["sr"]
if d == 1: # lower
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave = librosa.resample(
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
bp["sr"],
sr,
res_type="sinc_fastest",
)
else: # mid
spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
wave2 = np.add(
wave,
spectrogram_to_wave(
spec_s,
bp["hl"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
),
)
# wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
wave = librosa.core.resample(wave2, bp['sr'], sr,res_type='scipy')
wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
return wave.T
@@ -318,7 +425,7 @@ def fft_lp_filter(spec, bin_start, bin_stop):
for b in range(bin_start, bin_stop):
g -= 1 / (bin_stop - bin_start)
spec[:, b, :] = g * spec[:, b, :]
spec[:, bin_stop:, :] *= 0
return spec
@@ -329,42 +436,69 @@ def fft_hp_filter(spec, bin_start, bin_stop):
for b in range(bin_start, bin_stop, -1):
g -= 1 / (bin_start - bin_stop)
spec[:, b, :] = g * spec[:, b, :]
spec[:, 0:bin_stop+1, :] *= 0
spec[:, 0 : bin_stop + 1, :] *= 0
return spec
def mirroring(a, spec_m, input_high_end, mp):
if 'mirroring' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
mirror = mirror * np.exp(1.j * np.angle(input_high_end))
return np.where(np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror)
if 'mirroring2' == a:
mirror = np.flip(np.abs(spec_m[:, mp.param['pre_filter_start']-10-input_high_end.shape[1]:mp.param['pre_filter_start']-10, :]), 1)
if "mirroring" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
return np.where(
np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
)
if "mirroring2" == a:
mirror = np.flip(
np.abs(
spec_m[
:,
mp.param["pre_filter_start"]
- 10
- input_high_end.shape[1] : mp.param["pre_filter_start"]
- 10,
:,
]
),
1,
)
mi = np.multiply(mirror, input_high_end * 1.7)
return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
def ensembling(a, specs):
def ensembling(a, specs):
for i in range(1, len(specs)):
if i == 1:
spec = specs[0]
ln = min([spec.shape[2], specs[i].shape[2]])
spec = spec[:,:,:ln]
specs[i] = specs[i][:,:,:ln]
spec = spec[:, :, :ln]
specs[i] = specs[i][:, :, :ln]
if 'min_mag' == a:
if "min_mag" == a:
spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
if 'max_mag' == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
if "max_mag" == a:
spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
return spec
def stft(wave, nfft, hl):
wave_left = np.asfortranarray(wave[0])
wave_right = np.asfortranarray(wave[1])
@@ -374,6 +508,7 @@ def stft(wave, nfft, hl):
return spec
def istft(spec, hl):
spec_left = np.asfortranarray(spec[0])
spec_right = np.asfortranarray(spec[1])
@@ -389,62 +524,94 @@ if __name__ == "__main__":
import time
import argparse
from model_param_init import ModelParameters
p = argparse.ArgumentParser()
p.add_argument('--algorithm', '-a', type=str, choices=['invert', 'invert_p', 'min_mag', 'max_mag', 'deep', 'align'], default='min_mag')
p.add_argument('--model_params', '-m', type=str, default=os.path.join('modelparams', '1band_sr44100_hl512.json'))
p.add_argument('--output_name', '-o', type=str, default='output')
p.add_argument('--vocals_only', '-v', action='store_true')
p.add_argument('input', nargs='+')
p.add_argument(
"--algorithm",
"-a",
type=str,
choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
default="min_mag",
)
p.add_argument(
"--model_params",
"-m",
type=str,
default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
)
p.add_argument("--output_name", "-o", type=str, default="output")
p.add_argument("--vocals_only", "-v", action="store_true")
p.add_argument("input", nargs="+")
args = p.parse_args()
start_time = time.time()
if args.algorithm.startswith('invert') and len(args.input) != 2:
raise ValueError('There should be two input files.')
if not args.algorithm.startswith('invert') and len(args.input) < 2:
raise ValueError('There must be at least two input files.')
if args.algorithm.startswith("invert") and len(args.input) != 2:
raise ValueError("There should be two input files.")
if not args.algorithm.startswith("invert") and len(args.input) < 2:
raise ValueError("There must be at least two input files.")
wave, specs = {}, {}
mp = ModelParameters(args.model_params)
for i in range(len(args.input)):
for i in range(len(args.input)):
spec = {}
for d in range(len(mp.param['band']), 0, -1):
bp = mp.param['band'][d]
if d == len(mp.param['band']): # high-end band
for d in range(len(mp.param["band"]), 0, -1):
bp = mp.param["band"][d]
if d == len(mp.param["band"]): # high-end band
wave[d], _ = librosa.load(
args.input[i], bp['sr'], False, dtype=np.float32, res_type=bp['res_type'])
if len(wave[d].shape) == 1: # mono to stereo
args.input[i],
bp["sr"],
False,
dtype=np.float32,
res_type=bp["res_type"],
)
if len(wave[d].shape) == 1: # mono to stereo
wave[d] = np.array([wave[d], wave[d]])
else: # lower bands
wave[d] = librosa.resample(wave[d+1], mp.param['band'][d+1]['sr'], bp['sr'], res_type=bp['res_type'])
spec[d] = wave_to_spectrogram(wave[d], bp['hl'], bp['n_fft'], mp.param['mid_side'], mp.param['mid_side_b2'], mp.param['reverse'])
else: # lower bands
wave[d] = librosa.resample(
wave[d + 1],
mp.param["band"][d + 1]["sr"],
bp["sr"],
res_type=bp["res_type"],
)
spec[d] = wave_to_spectrogram(
wave[d],
bp["hl"],
bp["n_fft"],
mp.param["mid_side"],
mp.param["mid_side_b2"],
mp.param["reverse"],
)
specs[i] = combine_spectrograms(spec, mp)
del wave
if args.algorithm == 'deep':
if args.algorithm == "deep":
d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
v_spec = d_spec - specs[1]
sf.write(os.path.join('{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
if args.algorithm.startswith('invert'):
sf.write(
os.path.join("{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
if args.algorithm.startswith("invert"):
ln = min([specs[0].shape[2], specs[1].shape[2]])
specs[0] = specs[0][:,:,:ln]
specs[1] = specs[1][:,:,:ln]
if 'invert_p' == args.algorithm:
specs[0] = specs[0][:, :, :ln]
specs[1] = specs[1][:, :, :ln]
if "invert_p" == args.algorithm:
X_mag = np.abs(specs[0])
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.j * np.angle(specs[0]))
y_mag = np.abs(specs[1])
max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
else:
specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
v_spec = specs[0] - specs[1]
@@ -458,28 +625,43 @@ if __name__ == "__main__":
y_image = spectrogram_to_image(y_mag)
v_image = spectrogram_to_image(v_mag)
cv2.imwrite('{}_X.png'.format(args.output_name), X_image)
cv2.imwrite('{}_y.png'.format(args.output_name), y_image)
cv2.imwrite('{}_v.png'.format(args.output_name), v_image)
sf.write('{}_X.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[0], mp), mp.param['sr'])
sf.write('{}_y.wav'.format(args.output_name), cmb_spectrogram_to_wave(specs[1], mp), mp.param['sr'])
sf.write('{}_v.wav'.format(args.output_name), cmb_spectrogram_to_wave(v_spec, mp), mp.param['sr'])
else:
if not args.algorithm == 'deep':
sf.write(os.path.join('ensembled','{}.wav'.format(args.output_name)), cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp), mp.param['sr'])
cv2.imwrite("{}_X.png".format(args.output_name), X_image)
cv2.imwrite("{}_y.png".format(args.output_name), y_image)
cv2.imwrite("{}_v.png".format(args.output_name), v_image)
if args.algorithm == 'align':
sf.write(
"{}_X.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[0], mp),
mp.param["sr"],
)
sf.write(
"{}_y.wav".format(args.output_name),
cmb_spectrogram_to_wave(specs[1], mp),
mp.param["sr"],
)
sf.write(
"{}_v.wav".format(args.output_name),
cmb_spectrogram_to_wave(v_spec, mp),
mp.param["sr"],
)
else:
if not args.algorithm == "deep":
sf.write(
os.path.join("ensembled", "{}.wav".format(args.output_name)),
cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
mp.param["sr"],
)
if args.algorithm == "align":
trackalignment = [
{
'file1':'"{}"'.format(args.input[0]),
'file2':'"{}"'.format(args.input[1])
"file1": '"{}"'.format(args.input[0]),
"file2": '"{}"'.format(args.input[1]),
}
]
for i,e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
#print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
# print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))

263
uvr5_pack/name_params.json Normal file
View File

@@ -0,0 +1,263 @@
{
"equivalent" : [
{
"model_hash_name" : [
{
"hash_name": "47939caf0cfe52a0e81442b85b971dfd",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "a82f14e75892e55e994376edbf0c8435",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "08611fb99bd59eaa79ad27c58d137727",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "5c7bbca45a187e81abbbd351606164e5",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "d6b2cb685a058a091e5e7098192d3233",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
},
{
"hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "c3448ec923fa0edf3d03a19e633faa53",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "68aa2c8093d0080704b200d140f59e54",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid.json"
},
{
"hash_name": "52fdca89576f06cf4340b74a4730ee5f",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "41191165b05d38fc77f072fa9e8e8a30",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100.json"
},
{
"hash_name": "89e83b511ad474592689e562d5b1f80e",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
},
{
"hash_name": "0b954da81d453b716b114d6d7c95177f",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000.json"
}
],
"v4 Models": [
{
"hash_name": "6a00461c51c2920fd68937d4609ed6c8",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "0ab504864d20f1bd378fe9c81ef37140",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "80ab74d65e515caa3622728d2de07d23",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr32000_hl512"
},
{
"hash_name": "edc115e7fc523245062200c00caa847f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "b58090534c52cbc3e9b5104bad666ef2",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "ae702fed0238afb5346db8356fe25f13",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
]
}
],
"User Models" : [
{
"1 Band": [
{
"hash_name": "1band_sr16000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr32000_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
"param_name": "1band_sr16000_hl512"
},
{
"hash_name": "1band_sr33075_hl384",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
"param_name": "1band_sr33075_hl384"
},
{
"hash_name": "1band_sr44100_hl256",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
"param_name": "1band_sr44100_hl256"
},
{
"hash_name": "1band_sr44100_hl512",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
"param_name": "1band_sr44100_hl512"
},
{
"hash_name": "1band_sr44100_hl1024",
"model_params": "uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
"param_name": "1band_sr44100_hl1024"
}
],
"2 Band": [
{
"hash_name": "2band_44100_lofi",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
"param_name": "2band_44100_lofi"
},
{
"hash_name": "2band_32000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_32000.json",
"param_name": "2band_32000"
},
{
"hash_name": "2band_48000",
"model_params": "uvr5_pack/lib_v5/modelparams/2band_48000.json",
"param_name": "2band_48000"
}
],
"3 Band": [
{
"hash_name": "3band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100.json",
"param_name": "3band_44100"
},
{
"hash_name": "3band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
"param_name": "3band_44100_mid"
},
{
"hash_name": "3band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
"param_name": "3band_44100_msb2"
}
],
"4 Band": [
{
"hash_name": "4band_44100",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100.json",
"param_name": "4band_44100"
},
{
"hash_name": "4band_44100_mid",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
"param_name": "4band_44100_mid"
},
{
"hash_name": "4band_44100_msb",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
"param_name": "4band_44100_msb"
},
{
"hash_name": "4band_44100_msb2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
"param_name": "4band_44100_msb2"
},
{
"hash_name": "4band_44100_reverse",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
"param_name": "4band_44100_reverse"
},
{
"hash_name": "4band_44100_sw",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
"param_name": "4band_44100_sw"
},
{
"hash_name": "4band_v2",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2.json",
"param_name": "4band_v2"
},
{
"hash_name": "4band_v2_sn",
"model_params": "uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
"param_name": "4band_v2_sn"
},
{
"hash_name": "tmodelparam",
"model_params": "uvr5_pack/lib_v5/modelparams/tmodelparam.json",
"param_name": "User Model Param Set"
}
]
}
]
}

View File

@@ -1,6 +1,15 @@
import torch
import numpy as np
from tqdm import tqdm
import json
def load_data(file_name: str = "./uvr5_pack/data.json") -> dict:
with open(file_name, "r") as f:
data = json.load(f)
return data
def make_padding(width, cropsize, offset):
left = offset
@@ -10,233 +19,102 @@ def make_padding(width, cropsize, offset):
right = roi_size - (width % roi_size) + left
return left, right, roi_size
def inference(X_spec, device, model, aggressiveness,data):
'''
def inference(X_spec, device, model, aggressiveness, data):
"""
data dic configs
'''
def _execute(X_mag_pad, roi_size, n_window, device, model, aggressiveness,is_half=True):
"""
def _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
):
model.eval()
with torch.no_grad():
preds = []
iterations = [n_window]
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
total_iterations = sum(iterations)
for i in tqdm(range(n_window)):
start = i * roi_size
X_mag_window = X_mag_pad[None, :, :, start:start + data['window_size']]
X_mag_window = X_mag_pad[
None, :, :, start : start + data["window_size"]
]
X_mag_window = torch.from_numpy(X_mag_window)
if(is_half):X_mag_window=X_mag_window.half()
X_mag_window=X_mag_window.to(device)
if is_half:
X_mag_window = X_mag_window.half()
X_mag_window = X_mag_window.to(device)
pred = model.predict(X_mag_window, aggressiveness)
pred = pred.detach().cpu().numpy()
preds.append(pred[0])
pred = np.concatenate(preds, axis=2)
return pred
def preprocess(X_spec):
X_mag = np.abs(X_spec)
X_phase = np.angle(X_spec)
return X_mag, X_phase
X_mag, X_phase = preprocess(X_spec)
coef = X_mag.max()
X_mag_pre = X_mag / coef
n_frame = X_mag_pre.shape[2]
pad_l, pad_r, roi_size = make_padding(n_frame,
data['window_size'], model.offset)
pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
n_window = int(np.ceil(n_frame / roi_size))
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
if(list(model.state_dict().values())[0].dtype==torch.float16):is_half=True
else:is_half=False
pred = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
if list(model.state_dict().values())[0].dtype == torch.float16:
is_half = True
else:
is_half = False
pred = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred = pred[:, :, :n_frame]
if data['tta']:
if data["tta"]:
pad_l += roi_size // 2
pad_r += roi_size // 2
n_window += 1
X_mag_pad = np.pad(
X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode='constant')
X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")
pred_tta = _execute(X_mag_pad, roi_size, n_window,
device, model, aggressiveness,is_half)
pred_tta = pred_tta[:, :, roi_size // 2:]
pred_tta = _execute(
X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
)
pred_tta = pred_tta[:, :, roi_size // 2 :]
pred_tta = pred_tta[:, :, :n_frame]
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.j * X_phase)
return (pred + pred_tta) * 0.5 * coef, X_mag, np.exp(1.0j * X_phase)
else:
return pred * coef, X_mag, np.exp(1.j * X_phase)
return pred * coef, X_mag, np.exp(1.0j * X_phase)
def _get_name_params(model_path , model_hash):
def _get_name_params(model_path, model_hash):
data = load_data()
flag = False
ModelName = model_path
if model_hash == '47939caf0cfe52a0e81442b85b971dfd':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '4e4ecb9764c50a8c414fee6e10395bbe':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'ca106edd563e034bde0bdec4bb7a4b36':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if model_hash == 'e60a1e84803ce4efc0a6551206cc4b71':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'a82f14e75892e55e994376edbf0c8435':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '6dd9eaa6f0420af9f1d403aaafa4cc06':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '08611fb99bd59eaa79ad27c58d137727':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if model_hash == '5c7bbca45a187e81abbbd351606164e5':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'd6b2cb685a058a091e5e7098192d3233':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
if model_hash == 'c1b9f38170a7c90e96f027992eb7c62b':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == 'c3448ec923fa0edf3d03a19e633faa53':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if model_hash == '68aa2c8093d0080704b200d140f59e54':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100.json')
if model_hash == 'fdc83be5b798e4bd29fe00fe6600e147':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '2ce34bc92fd57f55db16b7a4def3d745':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid.json')
if model_hash == '52fdca89576f06cf4340b74a4730ee5f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '41191165b05d38fc77f072fa9e8e8a30':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100.json')
if model_hash == '89e83b511ad474592689e562d5b1f80e':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
if model_hash == '0b954da81d453b716b114d6d7c95177f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000.json')
for type in list(data):
for model in list(data[type][0]):
for i in range(len(data[type][0][model])):
if str(data[type][0][model][i]["hash_name"]) == model_hash:
flag = True
elif str(data[type][0][model][i]["hash_name"]) in ModelName:
flag = True
#v4 Models
if model_hash == '6a00461c51c2920fd68937d4609ed6c8':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if model_hash == '0ab504864d20f1bd378fe9c81ef37140':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '7dd21065bf91c10f7fccb57d7d83b07f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == '80ab74d65e515caa3622728d2de07d23':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if model_hash == 'edc115e7fc523245062200c00caa847f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == '28063e9f6ab5b341c5f6d3c67f2045b7':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if model_hash == 'b58090534c52cbc3e9b5104bad666ef2':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == '0cdab9947f1b0928705f518f3c78ea8f':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if model_hash == 'ae702fed0238afb5346db8356fe25f13':
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#User Models
#1 Band
if '1band_sr16000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json')
param_name_auto=str('1band_sr16000_hl512')
if '1band_sr32000_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json')
param_name_auto=str('1band_sr32000_hl512')
if '1band_sr33075_hl384' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json')
param_name_auto=str('1band_sr33075_hl384')
if '1band_sr44100_hl256' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json')
param_name_auto=str('1band_sr44100_hl256')
if '1band_sr44100_hl512' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json')
param_name_auto=str('1band_sr44100_hl512')
if '1band_sr44100_hl1024' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json')
param_name_auto=str('1band_sr44100_hl1024')
#2 Band
if '2band_44100_lofi' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json')
param_name_auto=str('2band_44100_lofi')
if '2band_32000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_32000.json')
param_name_auto=str('2band_32000')
if '2band_48000' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/2band_48000.json')
param_name_auto=str('2band_48000')
#3 Band
if '3band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100.json')
param_name_auto=str('3band_44100')
if '3band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_mid.json')
param_name_auto=str('3band_44100_mid')
if '3band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json')
param_name_auto=str('3band_44100_msb2')
#4 Band
if '4band_44100' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100.json')
param_name_auto=str('4band_44100')
if '4band_44100_mid' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_mid.json')
param_name_auto=str('4band_44100_mid')
if '4band_44100_msb' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb.json')
param_name_auto=str('4band_44100_msb')
if '4band_44100_msb2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json')
param_name_auto=str('4band_44100_msb2')
if '4band_44100_reverse' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json')
param_name_auto=str('4band_44100_reverse')
if '4band_44100_sw' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_44100_sw.json')
param_name_auto=str('4band_44100_sw')
if '4band_v2' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2.json')
param_name_auto=str('4band_v2')
if '4band_v2_sn' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/4band_v2_sn.json')
param_name_auto=str('4band_v2_sn')
if 'tmodelparam' in ModelName:
model_params_auto=str('uvr5_pack/lib_v5/modelparams/tmodelparam.json')
param_name_auto=str('User Model Param Set')
return param_name_auto , model_params_auto
if flag:
model_params_auto = data[type][0][model][i]["model_params"]
param_name_auto = data[type][0][model][i]["param_name"]
if type == "equivalent":
return param_name_auto, model_params_auto
else:
flag = False
return param_name_auto, model_params_auto