Rongjiehuang
2023-04-09 17:02:38 +08:00
parent e8fdbbfc81
commit f3cf2be08c
8 changed files with 11 additions and 714 deletions

LICENSE Normal file


@@ -1,9 +1 @@
---
title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
emoji: 🎶
colorFrom: purple
colorTo: blue
sdk: gradio
app_file: "inference/svs/gradio/infer.py"
pinned: false
---
In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, and DiffSinger.


@@ -1,188 +0,0 @@
from utils.hparams import hparams
import torch
from torch import nn
import torch.nn.functional as F
from modules.commons.conv import TextConvEncoder, ConvBlocks
from modules.commons.common_layers import Embedding
from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
from utils.pitch_utils import denorm_f0, f0_to_coarse
FS_ENCODERS = {
'rel_fft': lambda hp, dict: RelTransformerEncoder(
len(dict), hp['hidden_size'], hp['hidden_size'],
hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'],
hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']),
}
FS_DECODERS = {
'conv': lambda hp: ConvBlocks(hp['hidden_size'], hp['hidden_size'], hp['dec_dilations'],
hp['dec_kernel_size'], layers_in_block=hp['layers_in_block'],
norm_type=hp['enc_dec_norm'], dropout=hp['dropout'],
post_net_kernel=hp.get('dec_post_net_kernel', 3)),
}
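# FS_ENCODERS / FS_DECODERS are small factories keyed by the `encoder_type` /
# `decoder_type` hparams; FastSpeech.__init__ below looks a builder up and calls
# it with the global hparams dict, roughly like this (sketch; `phoneme_dict` is
# an illustrative name for the phoneme dictionary):
#   encoder = FS_ENCODERS['rel_fft'](hparams, phoneme_dict)
#   decoder = FS_DECODERS['conv'](hparams)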
class DurationPredictor(torch.nn.Module):
def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
super(DurationPredictor, self).__init__()
self.offset = offset
self.conv = torch.nn.ModuleList()
self.kernel_size = kernel_size
for idx in range(n_layers):
in_chans = idim if idx == 0 else n_chans
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
def forward(self, x, x_padding=None):
x = x.transpose(1, -1) # (B, idim, Tmax)
for f in self.conv:
x = f(x) # (B, C, Tmax)
if x_padding is not None:
x = x * (1 - x_padding.float())[:, None, :]
x = self.linear(x.transpose(1, -1)) # [B, T, C]
        if x_padding is not None:
            x = x * (1 - x_padding.float())[:, :, None]  # (B, T, C)
x = x[..., 0] # (B, Tmax)
return x
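# A minimal usage sketch for DurationPredictor (shapes are illustrative; in
# FastSpeech below it receives the phoneme encoder output and the source padding mask):
#   dp = DurationPredictor(idim=256, n_chans=256)
#   h = torch.randn(2, 13, 256)         # [B, T_txt, H]
#   pad = torch.zeros(2, 13).bool()     # [B, T_txt], True at padded positions
#   dur = dp(h, pad)                    # [B, T_txt], non-negative durations (Softplus output)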
class FastSpeech(nn.Module):
def __init__(self, dict_size, out_dims=None):
super().__init__()
self.enc_layers = hparams['enc_layers']
self.dec_layers = hparams['dec_layers']
self.hidden_size = hparams['hidden_size']
if hparams.get("use_bert") is True:
self.ph_encoder = BERTRelTransformerEncoder(dict_size, hparams['hidden_size'], hparams['hidden_size'],
hparams['ffn_hidden_size'], hparams['num_heads'], hparams['enc_layers'],
hparams['enc_ffn_kernel_size'], hparams['dropout'], prenet=hparams['enc_prenet'], pre_ln=hparams['enc_pre_ln'])
else:
self.ph_encoder = FS_ENCODERS[hparams['encoder_type']](hparams, dict_size)
self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
self.out_dims = hparams['audio_num_mel_bins'] if out_dims is None else out_dims
self.mel_out = nn.Linear(self.hidden_size, self.out_dims, bias=True)
if hparams['use_spk_id']:
self.spk_id_proj = Embedding(hparams['num_spk'], self.hidden_size)
if hparams['use_spk_embed']:
self.spk_embed_proj = nn.Linear(256, self.hidden_size, bias=True)
predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
self.dur_predictor = DurationPredictor(
self.hidden_size,
n_chans=predictor_hidden,
n_layers=hparams['dur_predictor_layers'],
dropout_rate=hparams['predictor_dropout'],
kernel_size=hparams['dur_predictor_kernel'])
self.length_regulator = LengthRegulator()
if hparams['use_pitch_embed']:
self.pitch_embed = Embedding(300, self.hidden_size, 0)
self.pitch_predictor = PitchPredictor(
self.hidden_size, n_chans=predictor_hidden,
n_layers=5, dropout_rate=0.1, odim=2,
kernel_size=hparams['predictor_kernel'])
if hparams['dec_inp_add_noise']:
self.z_channels = hparams['z_channels']
self.dec_inp_noise_proj = nn.Linear(self.hidden_size + self.z_channels, self.hidden_size)
def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None,
f0=None, uv=None, infer=False, **kwargs):
ret = {}
src_nonpadding = (txt_tokens > 0).float()[:, :, None]
style_embed = self.forward_style_embed(spk_embed, spk_id)
use_bert = hparams.get("use_bert") is True
        if use_bert:
            encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=kwargs['ph2word'],
                                          ret=ret) * src_nonpadding + style_embed
        else:
            encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed
# add dur
dur_inp = (encoder_out + style_embed) * src_nonpadding
mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret)
tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
decoder_inp = expand_states(encoder_out, mel2ph)
# add pitch embed
if hparams['use_pitch_embed']:
pitch_inp = (decoder_inp + style_embed) * tgt_nonpadding
decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out)
# decoder input
ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding
if hparams['dec_inp_add_noise']:
B, T, _ = decoder_inp.shape
z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device)
ret['adv_z'] = z
decoder_inp = torch.cat([decoder_inp, z], -1)
decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding
ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
return ret
def forward_style_embed(self, spk_embed=None, spk_id=None):
# add spk embed
style_embed = 0
if hparams['use_spk_embed']:
style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :]
if hparams['use_spk_id']:
style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :]
return style_embed
def forward_dur(self, dur_input, mel2ph, txt_tokens, ret):
"""
:param dur_input: [B, T_txt, H]
:param mel2ph: [B, T_mel]
:param txt_tokens: [B, T_txt]
:param ret:
:return:
"""
src_padding = txt_tokens == 0
        if hparams['predictor_grad'] != 1:
            # scale only the gradient flowing back into the encoder; the forward value is unchanged
            dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
dur = self.dur_predictor(dur_input, src_padding)
ret['dur'] = dur
if mel2ph is None:
mel2ph = self.length_regulator(dur, src_padding).detach()
ret['mel2ph'] = mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
return mel2ph
def forward_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
if hparams['pitch_type'] == 'frame':
pitch_pred_inp = decoder_inp
pitch_padding = mel2ph == 0
else:
pitch_pred_inp = encoder_out
pitch_padding = encoder_out.abs().sum(-1) == 0
uv = None
if hparams['predictor_grad'] != 1:
pitch_pred_inp = pitch_pred_inp.detach() + \
hparams['predictor_grad'] * (pitch_pred_inp - pitch_pred_inp.detach())
ret['pitch_pred'] = pitch_pred = self.pitch_predictor(pitch_pred_inp)
use_uv = hparams['pitch_type'] == 'frame' and hparams['use_uv']
if f0 is None:
f0 = pitch_pred[:, :, 0]
if use_uv:
uv = pitch_pred[:, :, 1] > 0
f0_denorm = denorm_f0(f0, uv if use_uv else None, pitch_padding=pitch_padding)
pitch = f0_to_coarse(f0_denorm) # start from 0 [B, T_txt]
ret['f0_denorm'] = f0_denorm
ret['f0_denorm_pred'] = denorm_f0(
pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None,
pitch_padding=pitch_padding)
if hparams['pitch_type'] == 'ph':
pitch = torch.gather(F.pad(pitch, [1, 0]), 1, mel2ph)
ret['f0_denorm'] = torch.gather(F.pad(ret['f0_denorm'], [1, 0]), 1, mel2ph)
ret['f0_denorm_pred'] = torch.gather(F.pad(ret['f0_denorm_pred'], [1, 0]), 1, mel2ph)
pitch_embed = self.pitch_embed(pitch)
return pitch_embed
def forward_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
x = decoder_inp # [B, T, H]
x = self.decoder(x)
x = self.mel_out(x)
return x * tgt_nonpadding
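# Rough end-to-end sketch for FastSpeech (assuming hparams has been populated by the
# project's config loading and optional inputs such as pitch/speaker/BERT features
# are disabled; shapes are illustrative):
#   model = FastSpeech(dict_size=80)
#   out = model(txt_tokens, infer=True)     # txt_tokens: [B, T_txt] LongTensor
#   mel = out['mel_out']                    # [B, T_mel, audio_num_mel_bins]
#   mel2ph = out['mel2ph']                  # [B, T_mel] predicted phoneme alignment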


@@ -1,202 +0,0 @@
import numpy as np
import torch
import torch.distributions as dist
from torch import nn
from modules.commons.conv import ConditionalConvBlocks
from modules.commons.normalizing_flow.res_flow import ResFlow
from modules.commons.wavenet import WN
class FVAEEncoder(nn.Module):
def __init__(self, c_in, hidden_size, c_latent, kernel_size,
n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
super().__init__()
self.strides = strides
self.hidden_size = hidden_size
if np.prod(strides) == 1:
self.pre_net = nn.Conv1d(c_in, hidden_size, kernel_size=1)
else:
self.pre_net = nn.Sequential(*[
nn.Conv1d(c_in, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
if i == 0 else
nn.Conv1d(hidden_size, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
if nn_type == 'wn':
self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
elif nn_type == 'conv':
self.nn = ConditionalConvBlocks(
hidden_size, c_cond, hidden_size, None, kernel_size,
layers_in_block=2, is_BTC=False, num_layers=n_layers)
self.out_proj = nn.Conv1d(hidden_size, c_latent * 2, 1)
self.latent_channels = c_latent
def forward(self, x, nonpadding, cond):
x = self.pre_net(x)
nonpadding = nonpadding[:, :, ::np.prod(self.strides)][:, :, :x.shape[-1]]
x = x * nonpadding
x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
x = self.out_proj(x)
m, logs = torch.split(x, self.latent_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs))
return z, m, logs, nonpadding
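# Note: FVAEEncoder returns a reparameterized posterior sample z = m + eps * exp(logs)
# together with the posterior mean / log-std and the strided nonpadding mask, so that
# FVAE.forward below can compute the KL term on the downsampled time axis.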
class FVAEDecoder(nn.Module):
def __init__(self, c_latent, hidden_size, out_channels, kernel_size,
n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
super().__init__()
self.strides = strides
self.hidden_size = hidden_size
self.pre_net = nn.Sequential(*[
nn.ConvTranspose1d(c_latent, hidden_size, kernel_size=s, stride=s)
if i == 0 else
nn.ConvTranspose1d(hidden_size, hidden_size, kernel_size=s, stride=s)
for i, s in enumerate(strides)
])
if nn_type == 'wn':
self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
elif nn_type == 'conv':
self.nn = ConditionalConvBlocks(
hidden_size, c_cond, hidden_size, [1] * n_layers, kernel_size,
layers_in_block=2, is_BTC=False)
self.out_proj = nn.Conv1d(hidden_size, out_channels, 1)
def forward(self, x, nonpadding, cond):
x = self.pre_net(x)
x = x * nonpadding
x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
x = self.out_proj(x)
return x
class FVAE(nn.Module):
def __init__(self,
c_in_out, hidden_size, c_latent,
kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
encoder_type='wn', decoder_type='wn'):
super(FVAE, self).__init__()
self.strides = strides
self.hidden_size = hidden_size
self.latent_size = c_latent
self.use_prior_flow = use_prior_flow
if np.prod(strides) == 1:
self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
else:
self.g_pre_net = nn.Sequential(*[
nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
if use_prior_flow:
self.prior_flow = ResFlow(
c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
self.prior_dist = dist.Normal(0, 1)
def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0, **kwargs):
"""
:param x: [B, C_in_out, T]
:param nonpadding: [B, 1, T]
:param cond: [B, C_g, T]
:return:
"""
if nonpadding is None:
nonpadding = 1
cond_sqz = self.g_pre_net(cond)
if not infer:
z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
q_dist = dist.Normal(m_q, logs_q.exp())
if self.use_prior_flow:
logqx = q_dist.log_prob(z_q)
z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
logpx = self.prior_dist.log_prob(z_p)
loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
else:
loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
z_p = None
return z_q, loss_kl, z_p, m_q, logs_q
else:
latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
if self.use_prior_flow:
z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
return z_p
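# NOTE: SyntaFVAE below additionally relies on GraphAuxEnc (a word-level syntactic
# graph encoder); it is not imported at the top of this file, so it is presumably
# provided by another module of the original repository.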
class SyntaFVAE(nn.Module):
def __init__(self,
c_in_out, hidden_size, c_latent,
kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
encoder_type='wn', decoder_type='wn'):
super(SyntaFVAE, self).__init__()
self.strides = strides
self.hidden_size = hidden_size
self.latent_size = c_latent
self.use_prior_flow = use_prior_flow
if np.prod(strides) == 1:
self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
else:
self.g_pre_net = nn.Sequential(*[
nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
if use_prior_flow:
self.prior_flow = ResFlow(
c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
self.prior_dist = dist.Normal(0, 1)
self.graph_encoder = GraphAuxEnc(in_dim=hidden_size, hid_dim=hidden_size,out_dim=hidden_size)
def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0,
mel2word=None, ph2word=None, graph_lst=None, etypes_lst=None):
"""
:param x: target mel, [B, C_in_out, T]
:param nonpadding: [B, 1, T]
:param cond: phoneme encoding, [B, C_g, T]
:return:
"""
word_len = ph2word.max(dim=1)[0]
ph_encoding_for_graph = cond.detach() + 0.1 * (cond - cond.detach()) # only 0.1x grad can pass through
_, ph_out_word_encoding_for_graph = GraphAuxEnc.ph_encoding_to_word_encoding(ph_encoding_for_graph.transpose(1,2), mel2word, word_len)
t_m = mel2word.shape[-1]
g_graph = self.graph_encoder.word_forward(graph_lst=graph_lst, word_encoding=ph_out_word_encoding_for_graph, etypes_lst=etypes_lst)
g_graph = g_graph.transpose(1,2)
g_graph = GraphAuxEnc._postprocess_word2ph(g_graph,mel2word,t_m)
g_graph = g_graph.transpose(1,2)
cond = cond + g_graph * 1.
if nonpadding is None:
nonpadding = 1
cond_sqz = self.g_pre_net(cond)
if not infer:
z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
q_dist = dist.Normal(m_q, logs_q.exp())
if self.use_prior_flow:
logqx = q_dist.log_prob(z_q)
z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
logpx = self.prior_dist.log_prob(z_p)
loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
else:
loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
z_p = None
return z_q, loss_kl, z_p, m_q, logs_q
else:
latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
if self.use_prior_flow:
z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
return z_p
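# A rough FVAE usage sketch (channel sizes and tensor names are illustrative, not
# taken from a real config):
#   fvae = FVAE(c_in_out=80, hidden_size=192, c_latent=16, kernel_size=5,
#               enc_n_layers=8, dec_n_layers=4, c_cond=192, strides=[4],
#               use_prior_flow=False)
#   # training: x [B, 80, T] target mels, nonpadding [B, 1, T], cond [B, 192, T]
#   z_q, loss_kl, z_p, m_q, logs_q = fvae(x=mels, nonpadding=mask, cond=cond)
#   # inference: sample a latent from the prior, then decode it at full resolution
#   z_p = fvae(cond=cond, infer=True)
#   mel_recon = fvae.decoder(z_p, nonpadding=mask, cond=cond)   # [B, 80, T]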


@@ -1,230 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import Linear
from modules.commons.conv import ConvBlocks, ConditionalConvBlocks
from modules.commons.common_layers import Embedding
from modules.commons.rel_transformer import RelTransformerEncoder
from modules.commons.transformer import MultiheadAttention, FFTBlocks
from modules.commons.align_ops import clip_mel2token_to_multiple, build_word_mask, expand_states, mel2ph_to_mel2word
from modules.portaspeech.fs import FS_DECODERS, FastSpeech
from modules.portaspeech.fvae import FVAE
from utils.tts_utils import group_hidden_by_segs
from utils.hparams import hparams
class SinusoidalPosEmb(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, x):
"""
:param x: [B, T]
:return: [B, T, H]
"""
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
emb = x[:, :, None] * emb[None, :]
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
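# SinusoidalPosEmb maps a (possibly fractional) position tensor x of shape [B, T] to
# a transformer-style embedding [B, T, dim]: the first dim/2 channels are sin(x * w_k)
# and the last dim/2 are cos(x * w_k), with frequencies w_k spaced geometrically from
# 1 down to 1/10000. PortaSpeech uses it below to encode word-relative positions for
# the phoneme-to-mel attention.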
class PortaSpeech(FastSpeech):
def __init__(self, ph_dictionary, word_dictionary, out_dims=None):
super().__init__(ph_dictionary, out_dims)
# build linguistic encoder
if hparams['use_word_encoder']:
# default False, use independent word embedding instead of phoneme encoding to represent word
self.word_encoder = RelTransformerEncoder(
len(word_dictionary), self.hidden_size, self.hidden_size, self.hidden_size, 2,
hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
if hparams['dur_level'] == 'word':
if hparams['word_encoder_type'] == 'rel_fft':
self.ph2word_encoder = RelTransformerEncoder(
0, self.hidden_size, self.hidden_size, self.hidden_size, 2,
hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
if hparams['word_encoder_type'] == 'fft':
self.ph2word_encoder = FFTBlocks(
self.hidden_size, hparams['word_enc_layers'], 1, num_heads=hparams['num_heads'])
self.sin_pos = SinusoidalPosEmb(self.hidden_size)
self.enc_pos_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.dec_query_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.dec_res_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.attn = MultiheadAttention(self.hidden_size, 1, encoder_decoder_attention=True, bias=False)
self.attn.enable_torch_version = False
if hparams['text_encoder_postnet']:
self.text_encoder_postnet = ConvBlocks(
self.hidden_size, self.hidden_size, [1] * 3, 5, layers_in_block=2)
else:
self.sin_pos = SinusoidalPosEmb(self.hidden_size)
# build VAE decoder
if hparams['use_fvae']:
del self.decoder
del self.mel_out
self.fvae = FVAE(
c_in_out=self.out_dims,
hidden_size=hparams['fvae_enc_dec_hidden'], c_latent=hparams['latent_size'],
kernel_size=hparams['fvae_kernel_size'],
enc_n_layers=hparams['fvae_enc_n_layers'],
dec_n_layers=hparams['fvae_dec_n_layers'],
c_cond=self.hidden_size,
use_prior_flow=hparams['use_prior_flow'],
flow_hidden=hparams['prior_flow_hidden'],
flow_kernel_size=hparams['prior_flow_kernel_size'],
flow_n_steps=hparams['prior_flow_n_blocks'],
strides=[hparams['fvae_strides']],
encoder_type=hparams['fvae_encoder_type'],
decoder_type=hparams['fvae_decoder_type'],
)
else:
self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
if hparams['use_pitch_embed']:
self.pitch_embed = Embedding(300, self.hidden_size, 0)
if hparams['add_word_pos']:
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
def build_embedding(self, dictionary, embed_dim):
num_embeddings = len(dictionary)
emb = Embedding(num_embeddings, embed_dim, self.padding_idx)
return emb
def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
global_step=None, *args, **kwargs):
ret = {}
style_embed = self.forward_style_embed(spk_embed, spk_id)
x, tgt_nonpadding = self.run_text_encoder(
txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs)
x = x * tgt_nonpadding
ret['nonpadding'] = tgt_nonpadding
if hparams['use_pitch_embed']:
x = x + self.pitch_embed(pitch)
ret['decoder_inp'] = x
ret['mel_out_fvae'] = ret['mel_out'] = self.run_decoder(x, tgt_nonpadding, ret, infer, tgt_mels, global_step)
return ret
def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs):
        word2word = torch.arange(word_len)[None, :].to(ph2word.device) + 1  # [1, T_word], word indices starting at 1
src_nonpadding = (txt_tokens > 0).float()[:, :, None]
use_bert = hparams.get("use_bert") is True
if use_bert:
ph_encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=ph2word,
graph_lst=kwargs['graph_lst'], etypes_lst=kwargs['etypes_lst'],
cl_feats=kwargs['cl_feats'], ret=ret) * src_nonpadding + style_embed
else:
ph_encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed
if hparams['use_word_encoder']:
word_encoder_out = self.word_encoder(word_tokens) + style_embed
ph_encoder_out = ph_encoder_out + expand_states(word_encoder_out, ph2word)
if hparams['dur_level'] == 'word':
word_encoder_out = 0
h_ph_gb_word = group_hidden_by_segs(ph_encoder_out, ph2word, word_len)[0]
word_encoder_out = word_encoder_out + self.ph2word_encoder(h_ph_gb_word)
if hparams['use_word_encoder']:
word_encoder_out = word_encoder_out + self.word_encoder(word_tokens)
mel2word = self.forward_dur(ph_encoder_out, mel2word, ret, ph2word=ph2word, word_len=word_len)
mel2word = clip_mel2token_to_multiple(mel2word, hparams['frames_multiple'])
tgt_nonpadding = (mel2word > 0).float()[:, :, None]
enc_pos = self.get_pos_embed(word2word, ph2word) # [B, T_ph, H]
dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H]
dec_word_mask = build_word_mask(mel2word, ph2word) # [B, T_mel, T_ph]
x, weight = self.attention(ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask)
if hparams['add_word_pos']:
x = x + self.word_pos_proj(dec_pos)
ret['attn'] = weight
else:
mel2ph = self.forward_dur(ph_encoder_out, mel2ph, ret)
mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
mel2word = mel2ph_to_mel2word(mel2ph, ph2word)
x = expand_states(ph_encoder_out, mel2ph)
if hparams['add_word_pos']:
dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H]
x = x + self.word_pos_proj(dec_pos)
tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
if hparams['use_word_encoder']:
x = x + expand_states(word_encoder_out, mel2word)
return x, tgt_nonpadding
def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask):
ph_kv = self.enc_pos_proj(torch.cat([ph_encoder_out, enc_pos], -1))
word_enc_out_expend = expand_states(word_encoder_out, mel2word)
word_enc_out_expend = torch.cat([word_enc_out_expend, dec_pos], -1)
if hparams['text_encoder_postnet']:
word_enc_out_expend = self.dec_res_proj(word_enc_out_expend)
word_enc_out_expend = self.text_encoder_postnet(word_enc_out_expend)
dec_q = x_res = word_enc_out_expend
else:
dec_q = self.dec_query_proj(word_enc_out_expend)
x_res = self.dec_res_proj(word_enc_out_expend)
ph_kv, dec_q = ph_kv.transpose(0, 1), dec_q.transpose(0, 1)
x, (weight, _) = self.attn(dec_q, ph_kv, ph_kv, attn_mask=(1 - dec_word_mask) * -1e9)
x = x.transpose(0, 1)
x = x + x_res
return x, weight
def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, global_step=0):
if not hparams['use_fvae']:
x = self.decoder(x)
x = self.mel_out(x)
ret['kl'] = 0
return x * tgt_nonpadding
else:
decoder_inp = x
x = x.transpose(1, 2) # [B, H, T]
tgt_nonpadding_BHT = tgt_nonpadding.transpose(1, 2) # [B, H, T]
if infer:
z = self.fvae(cond=x, infer=True)
else:
tgt_mels = tgt_mels.transpose(1, 2) # [B, 80, T]
z, ret['kl'], ret['z_p'], ret['m_q'], ret['logs_q'] = self.fvae(
tgt_mels, tgt_nonpadding_BHT, cond=x)
if global_step < hparams['posterior_start_steps']:
z = torch.randn_like(z)
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
ret['pre_mel_out'] = x_recon
return x_recon
    def forward_dur(self, dur_input, mel2word, ret, **kwargs):
        """
        :param dur_input: [B, T_txt, H]
        :param mel2word: [B, T_mel]
        :param ret:
        :return:
        """
src_padding = dur_input.data.abs().sum(-1) == 0
dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
dur = self.dur_predictor(dur_input, src_padding)
if hparams['dur_level'] == 'word':
word_len = kwargs['word_len']
ph2word = kwargs['ph2word']
B, T_ph = ph2word.shape
dur = torch.zeros([B, word_len.max() + 1]).to(ph2word.device).scatter_add(1, ph2word, dur)
dur = dur[:, 1:]
ret['dur'] = dur
if mel2word is None:
mel2word = self.length_regulator(dur).detach()
return mel2word
def get_pos_embed(self, word2word, x2word):
x_pos = build_word_mask(word2word, x2word).float() # [B, T_word, T_ph]
x_pos = (x_pos.cumsum(-1) / x_pos.sum(-1).clamp(min=1)[..., None] * x_pos).sum(1)
x_pos = self.sin_pos(x_pos.float()) # [B, T_ph, H]
return x_pos
def store_inverse_all(self):
def remove_weight_norm(m):
try:
if hasattr(m, 'store_inverse'):
m.store_inverse()
nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(remove_weight_norm)
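# A rough inference sketch for PortaSpeech (assuming hparams match the original
# PortaSpeech-style configs and optional inputs such as pitch, speaker embeddings
# and BERT features are disabled; tensor names are illustrative):
#   model = PortaSpeech(ph_dict, word_dict)
#   out = model(txt_tokens, word_tokens, ph2word, word_len, infer=True)
#   mel = out['mel_out']       # [B, T_mel, 80]
#   attn = out.get('attn')     # phoneme-to-mel attention weights when dur_level == 'word'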


@@ -1,75 +0,0 @@
import torch
import torch.distributions as dist
from torch import nn
from modules.commons.normalizing_flow.glow_modules import Glow
from modules.portaspeech.portaspeech import PortaSpeech
from utils.hparams import hparams
class PortaSpeechFlow(PortaSpeech):
def __init__(self, ph_dict_size, word_dict_size, out_dims=None):
super().__init__(ph_dict_size, word_dict_size, out_dims)
cond_hs = 80
if hparams.get('use_txt_cond', True):
cond_hs = cond_hs + hparams['hidden_size']
if hparams.get('use_latent_cond', False):
cond_hs = cond_hs + hparams['latent_size']
if hparams['use_cond_proj']:
self.g_proj = nn.Conv1d(cond_hs, 160, 5, padding=2)
cond_hs = 160
self.post_flow = Glow(
80, hparams['post_glow_hidden'], hparams['post_glow_kernel_size'], 1,
hparams['post_glow_n_blocks'], hparams['post_glow_n_block_layers'],
n_split=4, n_sqz=2,
gin_channels=cond_hs,
share_cond_layers=hparams['post_share_cond_layers'],
share_wn_layers=hparams['share_wn_layers'],
sigmoid_scale=hparams['sigmoid_scale']
)
self.prior_dist = dist.Normal(0, 1)
def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
forward_post_glow=True, two_stage=True, global_step=None, **kwargs):
is_training = self.training
train_fvae = not (forward_post_glow and two_stage)
if not train_fvae:
self.eval()
with torch.set_grad_enabled(mode=train_fvae):
ret = super(PortaSpeechFlow, self).forward(
txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph,
spk_embed, spk_id, pitch, infer, tgt_mels, global_step, **kwargs)
if (forward_post_glow or not two_stage) and hparams['use_post_flow']:
self.run_post_glow(tgt_mels, infer, is_training, ret)
return ret
def run_post_glow(self, tgt_mels, infer, is_training, ret):
x_recon = ret['mel_out'].transpose(1, 2)
g = x_recon
B, _, T = g.shape
if hparams.get('use_txt_cond', True):
g = torch.cat([g, ret['decoder_inp'].transpose(1, 2)], 1)
if hparams.get('use_latent_cond', False):
g_z = ret['z_p'][:, :, :, None].repeat(1, 1, 1, 4).reshape(B, -1, T)
g = torch.cat([g, g_z], 1)
if hparams['use_cond_proj']:
g = self.g_proj(g)
prior_dist = self.prior_dist
if not infer:
if is_training:
self.post_flow.train()
nonpadding = ret['nonpadding'].transpose(1, 2)
y_lengths = nonpadding.sum(-1)
if hparams['detach_postflow_input']:
g = g.detach()
tgt_mels = tgt_mels.transpose(1, 2)
z_postflow, ldj = self.post_flow(tgt_mels, nonpadding, g=g)
ldj = ldj / y_lengths / 80
ret['z_pf'], ret['ldj_pf'] = z_postflow, ldj
ret['postflow'] = -prior_dist.log_prob(z_postflow).mean() - ldj.mean()
if torch.isnan(ret['postflow']):
ret['postflow'] = None
else:
nonpadding = torch.ones_like(x_recon[:, :1, :])
z_post = torch.randn(x_recon.shape).to(g.device) * hparams['noise_scale']
x_recon, _ = self.post_flow(z_post, nonpadding, g, reverse=True)
ret['mel_out'] = x_recon.transpose(1, 2)
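# Two-stage training, as implied by `forward_post_glow` / `two_stage`:
#   stage 1: train the FVAE text-to-mel model (forward_post_glow=False);
#   stage 2: freeze it (self.eval() inside torch.set_grad_enabled(False)) and train only
#            the post-net Glow, whose loss is the negative flow log-likelihood stored in
#            ret['postflow'].
# At inference, a latent is drawn from N(0, noise_scale^2) and run through the Glow in
# reverse to refine the predicted mel-spectrogram.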


@@ -51,16 +51,16 @@ from target_sound_detection.src.models import event_labels
from target_sound_detection.src.utils import median_filter, decode_with_timestamps
import clip
import numpy as np
AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
Audio ChatGPT cannot directly read audios, but it has a list of tools to finish different speech, audio, and singing voice tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, Audio ChatGPT is very strict about the file name and will never fabricate nonexistent files.
Audio ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide new audios to Audio ChatGPT with a description. The description helps Audio ChatGPT to understand this audio, but Audio ChatGPT should use tools to finish the following tasks, rather than directly imagining from the description.
Overall, Audio ChatGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
AUDIO_CHATGPT_PREFIX = """AudioGPT
AudioGPT cannot directly read audios, but it has a list of tools to finish different speech, audio, and singing voice tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, AudioGPT is very strict about the file name and will never fabricate nonexistent files.
AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish the following tasks, rather than directly imagining from the description.
Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
TOOLS:
------
Audio ChatGPT has access to the following tools:"""
AudioGPT has access to the following tools:"""
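# A rough sketch of how these prompt pieces are presumably wired into the LangChain
# agent behind ConversationBot; the helper names and keyword arguments below are
# assumptions for illustration, not taken from this diff:
#   agent = initialize_agent(
#       tools, self.llm, agent="conversational-react-description",
#       agent_kwargs={'prefix': AUDIO_CHATGPT_PREFIX,
#                     'format_instructions': AUDIO_CHATGPT_FORMAT_INSTRUCTIONS,
#                     'suffix': AUDIO_CHATGPT_SUFFIX},
#       memory=self.memory, verbose=True)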
AUDIO_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
@@ -781,7 +781,7 @@ class TargetSoundDetection:
class ConversationBot:
def __init__(self):
print("Initializing AudioChatGPT")
print("Initializing AudioGPT")
self.llm = OpenAI(temperature=0)
self.t2i = T2I(device="cuda:0")
self.i2t = ImageCaptioning(device="cuda:1")
@@ -959,8 +959,8 @@ if __name__ == '__main__':
bot = ConversationBot()
with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
with gr.Row():
gr.Markdown("## Audio ChatGPT")
chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT")
gr.Markdown("## AudioGPT")
chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
state = gr.State([])
with gr.Row():
with gr.Column(scale=0.7):


@@ -32,7 +32,7 @@ wget -P text_to_speech/checkpoints/ljspeech/ps_adv_baseline -i https://huggingfa
wget -P audio_to_text/audiocaps_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/clotho_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
# audio detection
# Audio detection
cd audio_detection/audio_infer/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/audio_detection.pth
cd mono2binaural/useful_ckpts