2023-04-27 23:34:03 +08:00
import numpy as np , parselmouth , torch , pdb
from time import time as ttime
import torch . nn . functional as F
2023-05-03 10:58:42 +10:00
import torchcrepe # Fork feature. Use the crepe f0 algorithm. New dependency (pip install torchcrepe)
2023-05-14 03:29:52 +10:00
from torch import Tensor
2023-04-27 23:34:03 +08:00
import scipy . signal as signal
2023-05-28 16:06:11 +00:00
import pyworld , os , traceback , faiss , librosa , torchcrepe
2023-04-27 23:34:03 +08:00
from scipy import signal
2023-05-13 03:29:30 +08:00
from functools import lru_cache
2023-04-27 23:34:03 +08:00
# Coefficients of a 5th-order Butterworth high-pass filter (48 Hz cutoff for
# 16 kHz audio). VC.pipeline applies them to the input with signal.filtfilt.
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

# Maps an input audio path to its waveform. VC.get_f0 stores the waveform here
# so that cache_harvest_f0 can be memoised on the (hashable) path instead of
# on the array itself.
input_audio_path2wav = {}
2023-05-13 03:29:30 +08:00
@lru_cache
def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
    """Return the harvest f0 contour (refined by stonemask) for a cached waveform.

    The waveform is looked up in ``input_audio_path2wav`` by
    ``input_audio_path``; the caller must have stored it there beforehand.

    NOTE(review): results are memoised on the argument tuple only, so storing
    a different waveform under the same path still returns the previously
    computed contour. The returned array is the cached object itself; callers
    should not modify it in place.
    """
    waveform = input_audio_path2wav[input_audio_path]
    raw_f0, frame_times = pyworld.harvest(
        waveform,
        fs=fs,
        f0_ceil=f0max,
        f0_floor=f0min,
        frame_period=frame_period,
    )
    return pyworld.stonemask(waveform, raw_f0, frame_times, fs)
2023-04-27 23:34:03 +08:00
2023-05-12 19:43:05 +00:00
2023-05-14 07:52:36 +00:00
def change_rms(data1, sr1, data2, sr2, rate):
    """Blend the loudness envelope of ``data1`` into ``data2``.

    ``data1`` is the input audio (sample rate ``sr1``) and ``data2`` is the
    output audio (sample rate ``sr2``). ``rate`` is the share of ``data2``'s
    own envelope that is kept: each sample of ``data2`` is multiplied by
    ``rms1 ** (1 - rate) * rms2 ** (rate - 1)``, so ``rate == 1`` applies a
    gain of exactly 1.

    ``data2`` is modified in place and also returned.
    """

    def _envelope(samples, sample_rate):
        # One RMS point per half second, linearly upsampled to one value per
        # sample of data2 so it can be applied element-wise.
        rms = librosa.feature.rms(
            y=samples,
            frame_length=sample_rate // 2 * 2,
            hop_length=sample_rate // 2,
        )
        rms = torch.from_numpy(rms)
        return F.interpolate(
            rms.unsqueeze(0), size=data2.shape[0], mode="linear"
        ).squeeze()

    rms1 = _envelope(data1, sr1)
    rms2 = _envelope(data2, sr2)
    # Floor the output envelope so the negative power below cannot divide by 0.
    rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
    data2 *= (
        torch.pow(rms1, torch.tensor(1 - rate))
        * torch.pow(rms2, torch.tensor(rate - 1))
    ).numpy()
    return data2
2023-05-12 19:43:05 +00:00
2023-05-14 07:52:36 +00:00
2023-04-27 23:34:03 +08:00
class VC(object):
    """Inference pipeline: f0 extraction, feature extraction and chunked synthesis.

    ``pipeline`` is the entry point; it calls ``get_f0`` once for the whole
    (padded) input and ``vc`` once per audio chunk.
    """

    def __init__(self, tgt_sr, config):
        """Derive sample-count constants from ``config``.

        ``config`` must expose ``x_pad``, ``x_query``, ``x_center``, ``x_max``,
        ``is_half`` and ``device``. ``tgt_sr`` is the synthesiser's output
        sample rate.
        """
        self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
            config.x_pad,
            config.x_query,
            config.x_center,
            config.x_max,
            config.is_half,
        )
        self.sr = 16000  # HuBERT input sample rate
        self.window = 160  # samples per frame
        self.t_pad = self.sr * self.x_pad  # padding before/after each segment
        self.t_pad_tgt = tgt_sr * self.x_pad  # same padding at the output rate
        self.t_pad2 = self.t_pad * 2
        self.t_query = self.sr * self.x_query  # search span around a cut point
        self.t_center = self.sr * self.x_center  # spacing of the cut points
        self.t_max = self.sr * self.x_max  # below this length no cutting is done
        self.device = config.device

    def get_optimal_torch_device(self, index: int = 0) -> torch.device:
        """Return a CUDA device if available, else MPS if available, else CPU.

        ``index`` is wrapped modulo the number of CUDA devices.
        """
        if torch.cuda.is_available():
            return torch.device(f"cuda:{index % torch.cuda.device_count()}")
        elif torch.backends.mps.is_available():
            return torch.device("mps")
        # TODO: pick up "xla" devices here (needs torch_xla.core.xla_model).
        return torch.device("cpu")

    def get_f0_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        p_len,
        hop_length=160,
        model="full",
    ):
        """Compute f0 with torchcrepe and resample it to ``p_len`` frames.

        ``hop_length`` trades pitch resolution against inference time.
        ``model`` is either "full" or "tiny". Frames whose predicted pitch is
        below 0.001 are treated as missing and end up as 0 in the result.
        """
        # astype copies, so the in-place normalisation does not touch the
        # caller's array; float32 is required by the crepe convolution.
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        torch_device = self.get_optimal_torch_device()
        audio = torch.from_numpy(x).to(torch_device, copy=True)
        audio = torch.unsqueeze(audio, dim=0)
        if audio.ndim == 2 and audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True).detach()
        audio = audio.detach()
        print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
        pitch = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=hop_length * 2,
            device=torch_device,
            pad=True,
        )
        p_len = p_len or x.shape[0] // hop_length
        # Resize the pitch track to exactly p_len points.
        source = np.array(pitch.squeeze(0).cpu().float().numpy())
        source[source < 0.001] = np.nan
        target = np.interp(
            np.arange(0, len(source) * p_len, len(source)) / p_len,
            np.arange(0, len(source)),
            source,
        )
        return np.nan_to_num(target)

    def get_f0_official_crepe_computation(
        self,
        x,
        f0_min,
        f0_max,
        hop_length=160,
        model="full",
    ):
        """Compute f0 with torchcrepe, smoothed and gated by periodicity.

        Periodicity is median-filtered and f0 mean-filtered (window 3); frames
        with periodicity below 0.1 are set to 0. Returns a 1-D numpy array.
        """
        # Pick a batch size that doesn't cause memory errors on your gpu.
        batch_size = 512
        audio = torch.tensor(np.copy(x))[None].float()
        f0, pd = torchcrepe.predict(
            audio,
            self.sr,
            hop_length,
            f0_min,
            f0_max,
            model,
            batch_size=batch_size,
            device=self.device,
            return_periodicity=True,
        )
        pd = torchcrepe.filter.median(pd, 3)
        f0 = torchcrepe.filter.mean(f0, 3)
        f0[pd < 0.1] = 0
        # The result must be returned: get_f0 and the hybrid stack use it.
        return f0[0].cpu().numpy()

    def get_f0_pyin_computation(self, x, f0_min, f0_max):
        """Compute f0 of ``x`` with librosa's pYIN, dropping the first frame.

        NOTE(review): this method is not called by get_f0 or the hybrid
        dispatch; its frame count is not aligned to ``p_len`` here.
        """
        f0, _, _ = librosa.pyin(x, sr=self.sr, fmin=f0_min, fmax=f0_max)
        return f0[1:]  # get rid of the extra first frame

    def _f0_pm(self, x, p_len, f0_min, f0_max, time_step):
        """Compute f0 with parselmouth and zero-pad it to ``p_len`` frames."""
        f0 = (
            parselmouth.Sound(x, self.sr)
            .to_pitch_ac(
                time_step=time_step / 1000,
                voicing_threshold=0.6,
                pitch_floor=f0_min,
                pitch_ceiling=f0_max,
            )
            .selected_array["frequency"]
        )
        pad_size = (p_len - len(f0) + 1) // 2
        if pad_size > 0 or p_len - len(f0) - pad_size > 0:
            f0 = np.pad(
                f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
            )
        return f0

    def _f0_dio(self, x, f0_min, f0_max):
        """Compute f0 with pyworld dio + stonemask, median-filtered (window 3)."""
        audio = x.astype(np.double)
        f0, t = pyworld.dio(
            audio,
            fs=self.sr,
            f0_ceil=f0_max,
            f0_floor=f0_min,
            frame_period=10,
        )
        f0 = pyworld.stonemask(audio, f0, t, self.sr)
        return signal.medfilt(f0, 3)

    @staticmethod
    def _f0_to_coarse(f0, f0_min, f0_max):
        """Quantise f0 (Hz) to integer bins 1..255 on a mel scale.

        Values that are 0 (or map below bin 1) become 1; values above the mel
        of ``f0_max`` are clipped to 255.
        """
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0_mel = 1127 * np.log(1 + f0 / 700)
        voiced = f0_mel > 0
        f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
            f0_mel_max - f0_mel_min
        ) + 1
        f0_mel[f0_mel <= 1] = 1
        f0_mel[f0_mel > 255] = 255
        # np.int was removed in NumPy 1.24; the builtin int is what it aliased.
        return np.rint(f0_mel).astype(int)

    def get_f0_hybrid_computation(
        self,
        methods_str,
        input_audio_path,
        x,
        f0_min,
        f0_max,
        p_len,
        filter_radius,
        crepe_hop_length,
        time_step,
    ):
        """Compute f0 with several methods and return their per-frame nanmedian.

        ``methods_str`` has the form ``hybrid[m1+m2+...]``. With a single
        method its contour is returned unchanged. For "harvest" the waveform
        must already be stored in ``input_audio_path2wav`` (get_f0 does this).

        Raises:
            ValueError: if a listed method name is not recognised.
        """
        methods = (
            methods_str.split("hybrid")[1].replace("[", "").replace("]", "").split("+")
        )
        f0_computation_stack = []

        print("Calculating f0 pitch estimations for methods: %s" % str(methods))
        x = x.astype(np.float32)
        x /= np.quantile(np.abs(x), 0.999)
        for method in methods:
            if method == "pm":
                f0 = self._f0_pm(x, p_len, f0_min, f0_max, time_step)
            elif method == "crepe":
                f0 = self.get_f0_official_crepe_computation(
                    x, f0_min, f0_max, crepe_hop_length
                )
            elif method == "crepe-tiny":
                f0 = self.get_f0_official_crepe_computation(
                    x, f0_min, f0_max, crepe_hop_length, "tiny"
                )
            elif method == "mangio-crepe":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length
                )
            elif method == "mangio-crepe-tiny":
                f0 = self.get_f0_crepe_computation(
                    x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
                )
            elif method == "harvest":
                f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
                if filter_radius > 2:
                    f0 = signal.medfilt(f0, 3)
                f0 = f0[1:]  # get rid of the first frame
            elif method == "dio":  # potentially buggy?
                f0 = self._f0_dio(x, f0_min, f0_max)[1:]
            else:
                # "pyin" is intentionally not enabled here yet.
                raise ValueError("Unknown hybrid f0 method: %r" % (method,))
            f0_computation_stack.append(f0)

        for fc in f0_computation_stack:
            print(len(fc))

        print("Calculating hybrid median f0 from the stack of: %s" % str(methods))
        if len(f0_computation_stack) == 1:
            return f0_computation_stack[0]
        return np.nanmedian(f0_computation_stack, axis=0)

    def get_f0(
        self,
        input_audio_path,
        x,
        p_len,
        f0_up_key,
        f0_method,
        filter_radius,
        crepe_hop_length,
        inp_f0=None,
    ):
        """Extract, transpose and quantise the pitch contour of ``x``.

        ``f0_method`` is one of "pm", "harvest", "dio", "crepe", "crepe-tiny",
        "mangio-crepe", "mangio-crepe-tiny" or a string containing "hybrid"
        (see get_f0_hybrid_computation). ``f0_up_key`` is a shift in semitones.
        ``inp_f0``, if given, is a two-column array whose second column
        overwrites part of the contour, starting after the leading padding.

        Returns:
            ``(f0_coarse, f0bak)``: integer bins in 1..255 and the f0 in Hz.

        Raises:
            ValueError: if ``f0_method`` is not recognised.
        """
        global input_audio_path2wav
        time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        if f0_method == "pm":
            f0 = self._f0_pm(x, p_len, f0_min, f0_max, time_step)
        elif f0_method == "harvest":
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
            if filter_radius > 2:
                f0 = signal.medfilt(f0, 3)
        elif f0_method == "dio":  # potentially buggy?
            f0 = self._f0_dio(x, f0_min, f0_max)
        elif f0_method == "crepe":
            f0 = self.get_f0_official_crepe_computation(
                x, f0_min, f0_max, crepe_hop_length
            )
        elif f0_method == "crepe-tiny":
            f0 = self.get_f0_official_crepe_computation(
                x, f0_min, f0_max, crepe_hop_length, "tiny"
            )
        elif f0_method == "mangio-crepe":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length
            )
        elif f0_method == "mangio-crepe-tiny":
            f0 = self.get_f0_crepe_computation(
                x, f0_min, f0_max, p_len, crepe_hop_length, "tiny"
            )
        elif "hybrid" in f0_method:
            # Hybrid median pitch estimation; harvest needs the cached waveform.
            input_audio_path2wav[input_audio_path] = x.astype(np.double)
            f0 = self.get_f0_hybrid_computation(
                f0_method,
                input_audio_path,
                x,
                f0_min,
                f0_max,
                p_len,
                filter_radius,
                crepe_hop_length,
                time_step,
            )
        else:
            raise ValueError("Unknown f0 method: %r" % (f0_method,))

        # Not in place: f0 may be the array held by cache_harvest_f0's
        # lru_cache, and mutating it would compound the shift on later calls.
        f0 = f0 * pow(2, f0_up_key / 12)
        tf0 = self.sr // self.window  # f0 points per second
        if inp_f0 is not None:
            delta_t = np.round(
                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
            ).astype("int16")
            replace_f0 = np.interp(
                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
            )
            start = self.x_pad * tf0
            shape = f0[start : start + len(replace_f0)].shape[0]
            f0[start : start + len(replace_f0)] = replace_f0[:shape]
        f0bak = f0.copy()
        f0_coarse = self._f0_to_coarse(f0, f0_min, f0_max)
        return f0_coarse, f0bak  # 1-0

    def vc(
        self,
        model,
        net_g,
        sid,
        audio0,
        pitch,
        pitchf,
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        """Convert one audio chunk and return the synthesised waveform.

        ``model`` provides ``extract_features`` (and ``final_proj`` for "v1");
        ``net_g`` provides ``infer``. When ``index``/``big_npy`` are given and
        ``index_rate`` is non-zero, features are blended with a weighted
        average of their 8 nearest neighbours from the index. When
        ``protect < 0.5`` and pitch is supplied, frames with ``pitchf < 1``
        keep a ``1 - protect`` share of the un-retrieved features.

        ``times[0]`` and ``times[2]`` are incremented by the elapsed feature
        and synthesis time. Returns a 1-D float32 numpy array.
        """
        feats = torch.from_numpy(audio0)
        feats = feats.half() if self.is_half else feats.float()
        if feats.dim() == 2:  # two channels
            feats = feats.mean(-1)
        assert feats.dim() == 1, feats.dim()
        feats = feats.view(1, -1)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
        inputs = {
            "source": feats.to(self.device),
            "padding_mask": padding_mask,
            "output_layer": 9 if version == "v1" else 12,
        }
        t0 = ttime()
        with torch.no_grad():
            logits = model.extract_features(**inputs)
            feats = model.final_proj(logits[0]) if version == "v1" else logits[0]

        has_pitch = pitch is not None and pitchf is not None
        # Protection mixes by pitchf, so it only applies when pitch is given.
        use_protect = protect < 0.5 and has_pitch
        if use_protect:
            feats0 = feats.clone()
        if index is not None and big_npy is not None and index_rate != 0:
            npy = feats[0].cpu().numpy()
            if self.is_half:
                npy = npy.astype("float32")
            score, ix = index.search(npy, k=8)
            weight = np.square(1 / score)
            weight /= weight.sum(axis=1, keepdims=True)
            npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
            if self.is_half:
                npy = npy.astype("float16")
            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
            )

        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
        if use_protect:
            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                0, 2, 1
            )
        t1 = ttime()
        p_len = audio0.shape[0] // self.window
        if feats.shape[1] < p_len:
            p_len = feats.shape[1]
            if has_pitch:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]

        if use_protect:
            pitchff = pitchf.clone()
            pitchff[pitchf > 0] = 1
            pitchff[pitchf < 1] = protect
            pitchff = pitchff.unsqueeze(-1)
            feats = feats * pitchff + feats0 * (1 - pitchff)
            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()
        with torch.no_grad():
            if has_pitch:
                audio1 = (
                    (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
                    .data.cpu()
                    .float()
                    .numpy()
                )
            else:
                audio1 = (
                    (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
                )
        del feats, p_len, padding_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        t2 = ttime()
        times[0] += t1 - t0
        times[2] += t2 - t1
        return audio1

    def pipeline(
        self,
        model,
        net_g,
        sid,
        audio,
        input_audio_path,
        times,
        f0_up_key,
        f0_method,
        file_index,
        # file_big_npy,
        index_rate,
        if_f0,
        filter_radius,
        tgt_sr,
        resample_sr,
        rms_mix_rate,
        version,
        protect,
        crepe_hop_length,
        f0_file=None,
    ):
        """Convert a whole recording and return it as int16 samples.

        Steps: optionally load the faiss index at ``file_index``; high-pass
        filter ``audio``; for inputs longer than ``t_max`` choose cut points
        at the quietest sample near each multiple of ``t_center``; extract
        pitch when ``if_f0 == 1``; run ``vc`` per chunk and trim the padding;
        optionally mix loudness (``rms_mix_rate != 1``) and resample
        (``resample_sr >= 16000`` and different from ``tgt_sr``).

        ``f0_file``, if it has a ``name`` attribute, is read as comma-separated
        rows of floats and passed to ``get_f0`` as ``inp_f0``. ``times[1]`` is
        incremented by the pitch-extraction time.
        """
        index = big_npy = None
        if file_index != "" and os.path.exists(file_index) and index_rate != 0:
            try:
                index = faiss.read_index(file_index)
                big_npy = index.reconstruct_n(0, index.ntotal)
            except Exception:
                # Best effort: continue without retrieval if the index is bad.
                traceback.print_exc()
                index = big_npy = None

        audio = signal.filtfilt(bh, ah, audio)
        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
        opt_ts = []
        if audio_pad.shape[0] > self.t_max:
            # Sliding-window sum over one frame, same length as `audio`.
            audio_sum = np.zeros_like(audio)
            for i in range(self.window):
                audio_sum += audio_pad[i : i - self.window]
            for t in range(self.t_center, audio.shape[0], self.t_center):
                search = np.abs(audio_sum[t - self.t_query : t + self.t_query])
                # First index of the minimum magnitude within the search span.
                opt_ts.append(t - self.t_query + np.argmin(search))

        s = 0
        audio_opt = []
        t = None
        t1 = ttime()
        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
        p_len = audio_pad.shape[0] // self.window

        inp_f0 = None
        if hasattr(f0_file, "name"):
            try:
                with open(f0_file.name, "r") as f:
                    lines = f.read().strip("\n").split("\n")
                # Assigned only after every row parsed, so a bad file leaves
                # inp_f0 as None instead of a half-built list.
                inp_f0 = np.array(
                    [[float(i) for i in line.split(",")] for line in lines],
                    dtype="float32",
                )
            except Exception:
                traceback.print_exc()

        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
        pitch, pitchf = None, None
        if if_f0 == 1:
            pitch, pitchf = self.get_f0(
                input_audio_path,
                audio_pad,
                p_len,
                f0_up_key,
                f0_method,
                filter_radius,
                crepe_hop_length,
                inp_f0,
            )
            pitch = pitch[:p_len]
            pitchf = pitchf[:p_len]
            # Works whether config.device is the string "mps" or a torch.device.
            if str(self.device).startswith("mps"):
                pitchf = pitchf.astype(np.float32)
            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
        t2 = ttime()
        times[1] += t2 - t1

        def convert(chunk, chunk_pitch, chunk_pitchf):
            # Run one chunk through vc and strip the padding at both ends.
            return self.vc(
                model,
                net_g,
                sid,
                chunk,
                chunk_pitch,
                chunk_pitchf,
                times,
                index,
                big_npy,
                index_rate,
                version,
                protect,
            )[self.t_pad_tgt : -self.t_pad_tgt]

        for t in opt_ts:
            t = t // self.window * self.window
            chunk_pitch = chunk_pitchf = None
            if if_f0 == 1:
                first = s // self.window
                last = (t + self.t_pad2) // self.window
                chunk_pitch = pitch[:, first:last]
                chunk_pitchf = pitchf[:, first:last]
            audio_opt.append(
                convert(
                    audio_pad[s : t + self.t_pad2 + self.window],
                    chunk_pitch,
                    chunk_pitchf,
                )
            )
            s = t

        # Remaining tail (or the whole input when no cut points were chosen).
        chunk_pitch = chunk_pitchf = None
        if if_f0 == 1:
            chunk_pitch = pitch[:, t // self.window :] if t is not None else pitch
            chunk_pitchf = pitchf[:, t // self.window :] if t is not None else pitchf
        audio_opt.append(convert(audio_pad[t:], chunk_pitch, chunk_pitchf))

        audio_opt = np.concatenate(audio_opt)
        if rms_mix_rate != 1:
            audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
        if resample_sr >= 16000 and tgt_sr != resample_sr:
            audio_opt = librosa.resample(
                audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
            )
        # Scale to int16, attenuating only if the peak would exceed 0.99.
        audio_max = np.abs(audio_opt).max() / 0.99
        max_int16 = 32768
        if audio_max > 1:
            max_int16 /= audio_max
        audio_opt = (audio_opt * max_int16).astype(np.int16)
        del pitch, pitchf, sid
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return audio_opt