Mirror of https://github.com/AIGC-Audio/AudioGPT.git (synced 2025-12-16 20:07:58 +01:00)
@@ -1,9 +1 @@
---
title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
emoji: 🎶
colorFrom: purple
colorTo: blue
sdk: gradio
app_file: "inference/svs/gradio/infer.py"
pinned: false
---
In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, and DiffSinger.
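The block above is a Hugging Face Spaces configuration: `sdk: gradio` makes Spaces launch the script named in `app_file` as a Gradio app. As a purely hypothetical illustration (the real `inference/svs/gradio/infer.py` builds the full DiffSinger demo and is not reproduced here), a minimal entry point of that shape looks like:

import gradio as gr

def synthesize(text: str) -> str:
    # placeholder for the DiffSinger singing-voice-synthesis call; returns a path to a generated wav
    return "outputs/demo.wav"

demo = gr.Interface(fn=synthesize, inputs="text", outputs="audio",
                    title="DiffSinger 🎶 Singing Voice Synthesis")
demo.launch()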
@@ -1,188 +0,0 @@
from utils.hparams import hparams
import torch
from torch import nn
import torch.nn.functional as F
from modules.commons.conv import TextConvEncoder, ConvBlocks
from modules.commons.common_layers import Embedding
from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
from utils.pitch_utils import denorm_f0, f0_to_coarse

FS_ENCODERS = {
    'rel_fft': lambda hp, dict: RelTransformerEncoder(
        len(dict), hp['hidden_size'], hp['hidden_size'],
        hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'],
        hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']),
}

FS_DECODERS = {
    'conv': lambda hp: ConvBlocks(hp['hidden_size'], hp['hidden_size'], hp['dec_dilations'],
                                  hp['dec_kernel_size'], layers_in_block=hp['layers_in_block'],
                                  norm_type=hp['enc_dec_norm'], dropout=hp['dropout'],
                                  post_net_kernel=hp.get('dec_post_net_kernel', 3)),
}


class DurationPredictor(torch.nn.Module):
    def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
        super(DurationPredictor, self).__init__()
        self.offset = offset
        self.conv = torch.nn.ModuleList()
        self.kernel_size = kernel_size
        for idx in range(n_layers):
            in_chans = idim if idx == 0 else n_chans
            self.conv += [torch.nn.Sequential(
                torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
                torch.nn.ReLU(),
                LayerNorm(n_chans, dim=1),
                torch.nn.Dropout(dropout_rate)
            )]
        self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())

    def forward(self, x, x_padding=None):
        x = x.transpose(1, -1)  # (B, idim, Tmax)
        for f in self.conv:
            x = f(x)  # (B, C, Tmax)
            if x_padding is not None:
                x = x * (1 - x_padding.float())[:, None, :]

        x = self.linear(x.transpose(1, -1))  # [B, T, C]
        if x_padding is not None:
            x = x * (1 - x_padding.float())[:, :, None]  # (B, T, C)
        x = x[..., 0]  # (B, Tmax)
        return x


class FastSpeech(nn.Module):
    def __init__(self, dict_size, out_dims=None):
        super().__init__()
        self.enc_layers = hparams['enc_layers']
        self.dec_layers = hparams['dec_layers']
        self.hidden_size = hparams['hidden_size']
        if hparams.get("use_bert") is True:
            self.ph_encoder = BERTRelTransformerEncoder(dict_size, hparams['hidden_size'], hparams['hidden_size'],
                                                        hparams['ffn_hidden_size'], hparams['num_heads'],
                                                        hparams['enc_layers'], hparams['enc_ffn_kernel_size'],
                                                        hparams['dropout'], prenet=hparams['enc_prenet'],
                                                        pre_ln=hparams['enc_pre_ln'])
        else:
            self.ph_encoder = FS_ENCODERS[hparams['encoder_type']](hparams, dict_size)
        self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
        self.out_dims = hparams['audio_num_mel_bins'] if out_dims is None else out_dims
        self.mel_out = nn.Linear(self.hidden_size, self.out_dims, bias=True)
        if hparams['use_spk_id']:
            self.spk_id_proj = Embedding(hparams['num_spk'], self.hidden_size)
        if hparams['use_spk_embed']:
            self.spk_embed_proj = nn.Linear(256, self.hidden_size, bias=True)
        predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
        self.dur_predictor = DurationPredictor(
            self.hidden_size,
            n_chans=predictor_hidden,
            n_layers=hparams['dur_predictor_layers'],
            dropout_rate=hparams['predictor_dropout'],
            kernel_size=hparams['dur_predictor_kernel'])
        self.length_regulator = LengthRegulator()
        if hparams['use_pitch_embed']:
            self.pitch_embed = Embedding(300, self.hidden_size, 0)
            self.pitch_predictor = PitchPredictor(
                self.hidden_size, n_chans=predictor_hidden,
                n_layers=5, dropout_rate=0.1, odim=2,
                kernel_size=hparams['predictor_kernel'])
        if hparams['dec_inp_add_noise']:
            self.z_channels = hparams['z_channels']
            self.dec_inp_noise_proj = nn.Linear(self.hidden_size + self.z_channels, self.hidden_size)

    def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None,
                f0=None, uv=None, infer=False, **kwargs):
        ret = {}
        src_nonpadding = (txt_tokens > 0).float()[:, :, None]
        style_embed = self.forward_style_embed(spk_embed, spk_id)

        use_bert = hparams.get("use_bert") is True
        if use_bert:
            # the phoneme encoder is registered as self.ph_encoder in __init__
            encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=kwargs['ph2word'],
                                          ret=ret) * src_nonpadding + style_embed
        else:
            encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed

        # add dur
        dur_inp = (encoder_out + style_embed) * src_nonpadding
        mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret)
        tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
        decoder_inp = expand_states(encoder_out, mel2ph)

        # add pitch embed
        if hparams['use_pitch_embed']:
            pitch_inp = (decoder_inp + style_embed) * tgt_nonpadding
            decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out)

        # decoder input
        ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding
        if hparams['dec_inp_add_noise']:
            B, T, _ = decoder_inp.shape
            z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device)
            ret['adv_z'] = z
            decoder_inp = torch.cat([decoder_inp, z], -1)
            decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding
        ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
        return ret

    def forward_style_embed(self, spk_embed=None, spk_id=None):
        # add spk embed
        style_embed = 0
        if hparams['use_spk_embed']:
            style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :]
        if hparams['use_spk_id']:
            style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :]
        return style_embed

    def forward_dur(self, dur_input, mel2ph, txt_tokens, ret):
        """

        :param dur_input: [B, T_txt, H]
        :param mel2ph: [B, T_mel]
        :param txt_tokens: [B, T_txt]
        :param ret:
        :return:
        """
        src_padding = txt_tokens == 0
        if hparams['predictor_grad'] != 1:
            dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
        dur = self.dur_predictor(dur_input, src_padding)
        ret['dur'] = dur
        if mel2ph is None:
            mel2ph = self.length_regulator(dur, src_padding).detach()
        ret['mel2ph'] = mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
        return mel2ph

    def forward_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
        if hparams['pitch_type'] == 'frame':
            pitch_pred_inp = decoder_inp
            pitch_padding = mel2ph == 0
        else:
            pitch_pred_inp = encoder_out
            pitch_padding = encoder_out.abs().sum(-1) == 0
            uv = None
        if hparams['predictor_grad'] != 1:
            pitch_pred_inp = pitch_pred_inp.detach() + \
                             hparams['predictor_grad'] * (pitch_pred_inp - pitch_pred_inp.detach())
        ret['pitch_pred'] = pitch_pred = self.pitch_predictor(pitch_pred_inp)
        use_uv = hparams['pitch_type'] == 'frame' and hparams['use_uv']
        if f0 is None:
            f0 = pitch_pred[:, :, 0]
            if use_uv:
                uv = pitch_pred[:, :, 1] > 0
        f0_denorm = denorm_f0(f0, uv if use_uv else None, pitch_padding=pitch_padding)
        pitch = f0_to_coarse(f0_denorm)  # start from 0 [B, T_txt]
        ret['f0_denorm'] = f0_denorm
        ret['f0_denorm_pred'] = denorm_f0(
            pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None,
            pitch_padding=pitch_padding)
        if hparams['pitch_type'] == 'ph':
            pitch = torch.gather(F.pad(pitch, [1, 0]), 1, mel2ph)
            ret['f0_denorm'] = torch.gather(F.pad(ret['f0_denorm'], [1, 0]), 1, mel2ph)
            ret['f0_denorm_pred'] = torch.gather(F.pad(ret['f0_denorm_pred'], [1, 0]), 1, mel2ph)
        pitch_embed = self.pitch_embed(pitch)
        return pitch_embed

    def forward_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
        x = decoder_inp  # [B, T, H]
        x = self.decoder(x)
        x = self.mel_out(x)
        return x * tgt_nonpadding
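# A minimal, standalone sketch (not part of the original file) of the gradient-scaling trick used
# above: the duration and pitch predictors shrink the gradient flowing back into the encoder with
# hparams['predictor_grad'] via x.detach() + g * (x - x.detach()).
import torch

def scale_grad(x, g):
    # forward value equals x; the gradient reaching x is multiplied by g
    return x.detach() + g * (x - x.detach())

x = torch.ones(3, requires_grad=True)
scale_grad(x, 0.1).sum().backward()
print(x.grad)  # tensor([0.1000, 0.1000, 0.1000])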
@@ -1,202 +0,0 @@
import numpy as np
import torch
import torch.distributions as dist
from torch import nn

from modules.commons.conv import ConditionalConvBlocks
from modules.commons.normalizing_flow.res_flow import ResFlow
from modules.commons.wavenet import WN


class FVAEEncoder(nn.Module):
    def __init__(self, c_in, hidden_size, c_latent, kernel_size,
                 n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
        super().__init__()
        self.strides = strides
        self.hidden_size = hidden_size
        if np.prod(strides) == 1:
            self.pre_net = nn.Conv1d(c_in, hidden_size, kernel_size=1)
        else:
            self.pre_net = nn.Sequential(*[
                nn.Conv1d(c_in, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
                if i == 0 else
                nn.Conv1d(hidden_size, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
                for i, s in enumerate(strides)
            ])
        if nn_type == 'wn':
            self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
        elif nn_type == 'conv':
            self.nn = ConditionalConvBlocks(
                hidden_size, c_cond, hidden_size, None, kernel_size,
                layers_in_block=2, is_BTC=False, num_layers=n_layers)

        self.out_proj = nn.Conv1d(hidden_size, c_latent * 2, 1)
        self.latent_channels = c_latent

    def forward(self, x, nonpadding, cond):
        x = self.pre_net(x)
        nonpadding = nonpadding[:, :, ::np.prod(self.strides)][:, :, :x.shape[-1]]
        x = x * nonpadding
        x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
        x = self.out_proj(x)
        m, logs = torch.split(x, self.latent_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs))
        return z, m, logs, nonpadding


class FVAEDecoder(nn.Module):
    def __init__(self, c_latent, hidden_size, out_channels, kernel_size,
                 n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
        super().__init__()
        self.strides = strides
        self.hidden_size = hidden_size
        self.pre_net = nn.Sequential(*[
            nn.ConvTranspose1d(c_latent, hidden_size, kernel_size=s, stride=s)
            if i == 0 else
            nn.ConvTranspose1d(hidden_size, hidden_size, kernel_size=s, stride=s)
            for i, s in enumerate(strides)
        ])
        if nn_type == 'wn':
            self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
        elif nn_type == 'conv':
            self.nn = ConditionalConvBlocks(
                hidden_size, c_cond, hidden_size, [1] * n_layers, kernel_size,
                layers_in_block=2, is_BTC=False)
        self.out_proj = nn.Conv1d(hidden_size, out_channels, 1)

    def forward(self, x, nonpadding, cond):
        x = self.pre_net(x)
        x = x * nonpadding
        x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
        x = self.out_proj(x)
        return x


class FVAE(nn.Module):
    def __init__(self,
                 c_in_out, hidden_size, c_latent,
                 kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
                 use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
                 encoder_type='wn', decoder_type='wn'):
        super(FVAE, self).__init__()
        self.strides = strides
        self.hidden_size = hidden_size
        self.latent_size = c_latent
        self.use_prior_flow = use_prior_flow
        if np.prod(strides) == 1:
            self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
        else:
            self.g_pre_net = nn.Sequential(*[
                nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
                for i, s in enumerate(strides)
            ])
        self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
                                   enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
        if use_prior_flow:
            self.prior_flow = ResFlow(
                c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
        self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
                                   dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
        self.prior_dist = dist.Normal(0, 1)

    def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0, **kwargs):
        """

        :param x: [B, C_in_out, T]
        :param nonpadding: [B, 1, T]
        :param cond: [B, C_g, T]
        :return:
        """
        if nonpadding is None:
            nonpadding = 1
        cond_sqz = self.g_pre_net(cond)
        if not infer:
            z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
            q_dist = dist.Normal(m_q, logs_q.exp())
            if self.use_prior_flow:
                logqx = q_dist.log_prob(z_q)
                z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
                logpx = self.prior_dist.log_prob(z_p)
                loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
            else:
                loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
                loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
                z_p = None
            return z_q, loss_kl, z_p, m_q, logs_q
        else:
            latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
            z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
            if self.use_prior_flow:
                z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
            return z_p


# NOTE: GraphAuxEnc (the syntactic graph encoder from SyntaSpeech) is referenced below
# but is not imported anywhere in this file; the import must be supplied by the surrounding package.
class SyntaFVAE(nn.Module):
    def __init__(self,
                 c_in_out, hidden_size, c_latent,
                 kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
                 use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
                 encoder_type='wn', decoder_type='wn'):
        super(SyntaFVAE, self).__init__()
        self.strides = strides
        self.hidden_size = hidden_size
        self.latent_size = c_latent
        self.use_prior_flow = use_prior_flow
        if np.prod(strides) == 1:
            self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
        else:
            self.g_pre_net = nn.Sequential(*[
                nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
                for i, s in enumerate(strides)
            ])
        self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
                                   enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
        if use_prior_flow:
            self.prior_flow = ResFlow(
                c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
        self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
                                   dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
        self.prior_dist = dist.Normal(0, 1)
        self.graph_encoder = GraphAuxEnc(in_dim=hidden_size, hid_dim=hidden_size, out_dim=hidden_size)

    def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0,
                mel2word=None, ph2word=None, graph_lst=None, etypes_lst=None):
        """

        :param x: target mel, [B, C_in_out, T]
        :param nonpadding: [B, 1, T]
        :param cond: phoneme encoding, [B, C_g, T]
        :return:
        """
        word_len = ph2word.max(dim=1)[0]
        ph_encoding_for_graph = cond.detach() + 0.1 * (cond - cond.detach())  # only 0.1x grad can pass through
        _, ph_out_word_encoding_for_graph = GraphAuxEnc.ph_encoding_to_word_encoding(
            ph_encoding_for_graph.transpose(1, 2), mel2word, word_len)
        t_m = mel2word.shape[-1]
        g_graph = self.graph_encoder.word_forward(
            graph_lst=graph_lst, word_encoding=ph_out_word_encoding_for_graph, etypes_lst=etypes_lst)
        g_graph = g_graph.transpose(1, 2)
        g_graph = GraphAuxEnc._postprocess_word2ph(g_graph, mel2word, t_m)
        g_graph = g_graph.transpose(1, 2)
        cond = cond + g_graph * 1.

        if nonpadding is None:
            nonpadding = 1
        cond_sqz = self.g_pre_net(cond)
        if not infer:
            z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
            q_dist = dist.Normal(m_q, logs_q.exp())
            if self.use_prior_flow:
                logqx = q_dist.log_prob(z_q)
                z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
                logpx = self.prior_dist.log_prob(z_p)
                loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
            else:
                loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
                loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
                z_p = None
            return z_q, loss_kl, z_p, m_q, logs_q
        else:
            latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
            z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
            if self.use_prior_flow:
                z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
            return z_p
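# A minimal sketch (not part of the original file) of the FVAE training path above without the
# prior flow: reparameterised sampling z = m + eps * exp(logs), then a length-masked KL against
# N(0, 1) averaged the same way loss_kl is in FVAE.forward. Shapes follow the file: [B, C_latent, T].
import torch
import torch.distributions as dist

B, C, T = 2, 16, 50
m, logs = torch.randn(B, C, T), 0.1 * torch.randn(B, C, T)
nonpadding = torch.ones(B, 1, T)                       # 1 for real frames, 0 for padded frames

z_q = m + torch.randn_like(m) * logs.exp()             # reparameterisation, as in FVAEEncoder
q_dist = dist.Normal(m, logs.exp())
kl = dist.kl_divergence(q_dist, dist.Normal(0, 1))     # [B, C, T]
loss_kl = (kl * nonpadding).sum() / nonpadding.sum() / z_q.shape[1]
print(z_q.shape, float(loss_kl))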
@@ -1,230 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import Linear

from modules.commons.conv import ConvBlocks, ConditionalConvBlocks
from modules.commons.common_layers import Embedding
from modules.commons.rel_transformer import RelTransformerEncoder
from modules.commons.transformer import MultiheadAttention, FFTBlocks
from modules.commons.align_ops import clip_mel2token_to_multiple, build_word_mask, expand_states, mel2ph_to_mel2word
from modules.portaspeech.fs import FS_DECODERS, FastSpeech
from modules.portaspeech.fvae import FVAE
from utils.tts_utils import group_hidden_by_segs
from utils.hparams import hparams


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        """

        :param x: [B, T]
        :return: [B, T, H]
        """
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, :, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class PortaSpeech(FastSpeech):
    def __init__(self, ph_dictionary, word_dictionary, out_dims=None):
        super().__init__(ph_dictionary, out_dims)
        # build linguistic encoder
        if hparams['use_word_encoder']:
            # default False: use an independent word embedding instead of the phoneme encoding to represent words
            self.word_encoder = RelTransformerEncoder(
                len(word_dictionary), self.hidden_size, self.hidden_size, self.hidden_size, 2,
                hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
        if hparams['dur_level'] == 'word':
            if hparams['word_encoder_type'] == 'rel_fft':
                self.ph2word_encoder = RelTransformerEncoder(
                    0, self.hidden_size, self.hidden_size, self.hidden_size, 2,
                    hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
            if hparams['word_encoder_type'] == 'fft':
                self.ph2word_encoder = FFTBlocks(
                    self.hidden_size, hparams['word_enc_layers'], 1, num_heads=hparams['num_heads'])
            self.sin_pos = SinusoidalPosEmb(self.hidden_size)
            self.enc_pos_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
            self.dec_query_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
            self.dec_res_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
            self.attn = MultiheadAttention(self.hidden_size, 1, encoder_decoder_attention=True, bias=False)
            self.attn.enable_torch_version = False
            if hparams['text_encoder_postnet']:
                self.text_encoder_postnet = ConvBlocks(
                    self.hidden_size, self.hidden_size, [1] * 3, 5, layers_in_block=2)
        else:
            self.sin_pos = SinusoidalPosEmb(self.hidden_size)
        # build VAE decoder
        if hparams['use_fvae']:
            del self.decoder
            del self.mel_out
            self.fvae = FVAE(
                c_in_out=self.out_dims,
                hidden_size=hparams['fvae_enc_dec_hidden'], c_latent=hparams['latent_size'],
                kernel_size=hparams['fvae_kernel_size'],
                enc_n_layers=hparams['fvae_enc_n_layers'],
                dec_n_layers=hparams['fvae_dec_n_layers'],
                c_cond=self.hidden_size,
                use_prior_flow=hparams['use_prior_flow'],
                flow_hidden=hparams['prior_flow_hidden'],
                flow_kernel_size=hparams['prior_flow_kernel_size'],
                flow_n_steps=hparams['prior_flow_n_blocks'],
                strides=[hparams['fvae_strides']],
                encoder_type=hparams['fvae_encoder_type'],
                decoder_type=hparams['fvae_decoder_type'],
            )
        else:
            self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
            self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
        if hparams['use_pitch_embed']:
            self.pitch_embed = Embedding(300, self.hidden_size, 0)
        if hparams['add_word_pos']:
            self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)

    def build_embedding(self, dictionary, embed_dim):
        num_embeddings = len(dictionary)
        emb = Embedding(num_embeddings, embed_dim, self.padding_idx)
        return emb

    def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
                spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
                global_step=None, *args, **kwargs):
        ret = {}
        style_embed = self.forward_style_embed(spk_embed, spk_id)
        x, tgt_nonpadding = self.run_text_encoder(
            txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs)
        x = x * tgt_nonpadding
        ret['nonpadding'] = tgt_nonpadding
        if hparams['use_pitch_embed']:
            x = x + self.pitch_embed(pitch)
        ret['decoder_inp'] = x
        ret['mel_out_fvae'] = ret['mel_out'] = self.run_decoder(x, tgt_nonpadding, ret, infer, tgt_mels, global_step)
        return ret

    def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs):
        word2word = torch.arange(word_len)[None, :].to(ph2word.device) + 1  # [B, T_mel, T_word]
        src_nonpadding = (txt_tokens > 0).float()[:, :, None]
        use_bert = hparams.get("use_bert") is True
        if use_bert:
            ph_encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=ph2word,
                                             graph_lst=kwargs['graph_lst'], etypes_lst=kwargs['etypes_lst'],
                                             cl_feats=kwargs['cl_feats'], ret=ret) * src_nonpadding + style_embed
        else:
            ph_encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed
        if hparams['use_word_encoder']:
            word_encoder_out = self.word_encoder(word_tokens) + style_embed
            ph_encoder_out = ph_encoder_out + expand_states(word_encoder_out, ph2word)
        if hparams['dur_level'] == 'word':
            word_encoder_out = 0
            h_ph_gb_word = group_hidden_by_segs(ph_encoder_out, ph2word, word_len)[0]
            word_encoder_out = word_encoder_out + self.ph2word_encoder(h_ph_gb_word)
            if hparams['use_word_encoder']:
                word_encoder_out = word_encoder_out + self.word_encoder(word_tokens)
            mel2word = self.forward_dur(ph_encoder_out, mel2word, ret, ph2word=ph2word, word_len=word_len)
            mel2word = clip_mel2token_to_multiple(mel2word, hparams['frames_multiple'])
            tgt_nonpadding = (mel2word > 0).float()[:, :, None]
            enc_pos = self.get_pos_embed(word2word, ph2word)  # [B, T_ph, H]
            dec_pos = self.get_pos_embed(word2word, mel2word)  # [B, T_mel, H]
            dec_word_mask = build_word_mask(mel2word, ph2word)  # [B, T_mel, T_ph]
            x, weight = self.attention(ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask)
            if hparams['add_word_pos']:
                x = x + self.word_pos_proj(dec_pos)
            ret['attn'] = weight
        else:
            mel2ph = self.forward_dur(ph_encoder_out, mel2ph, ret)
            mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
            mel2word = mel2ph_to_mel2word(mel2ph, ph2word)
            x = expand_states(ph_encoder_out, mel2ph)
            if hparams['add_word_pos']:
                dec_pos = self.get_pos_embed(word2word, mel2word)  # [B, T_mel, H]
                x = x + self.word_pos_proj(dec_pos)
            tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
        if hparams['use_word_encoder']:
            x = x + expand_states(word_encoder_out, mel2word)
        return x, tgt_nonpadding

    def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask):
        ph_kv = self.enc_pos_proj(torch.cat([ph_encoder_out, enc_pos], -1))
        word_enc_out_expend = expand_states(word_encoder_out, mel2word)
        word_enc_out_expend = torch.cat([word_enc_out_expend, dec_pos], -1)
        if hparams['text_encoder_postnet']:
            word_enc_out_expend = self.dec_res_proj(word_enc_out_expend)
            word_enc_out_expend = self.text_encoder_postnet(word_enc_out_expend)
            dec_q = x_res = word_enc_out_expend
        else:
            dec_q = self.dec_query_proj(word_enc_out_expend)
            x_res = self.dec_res_proj(word_enc_out_expend)
        ph_kv, dec_q = ph_kv.transpose(0, 1), dec_q.transpose(0, 1)
        x, (weight, _) = self.attn(dec_q, ph_kv, ph_kv, attn_mask=(1 - dec_word_mask) * -1e9)
        x = x.transpose(0, 1)
        x = x + x_res
        return x, weight

    def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, global_step=0):
        if not hparams['use_fvae']:
            x = self.decoder(x)
            x = self.mel_out(x)
            ret['kl'] = 0
            return x * tgt_nonpadding
        else:
            decoder_inp = x
            x = x.transpose(1, 2)  # [B, H, T]
            tgt_nonpadding_BHT = tgt_nonpadding.transpose(1, 2)  # [B, H, T]
            if infer:
                z = self.fvae(cond=x, infer=True)
            else:
                tgt_mels = tgt_mels.transpose(1, 2)  # [B, 80, T]
                z, ret['kl'], ret['z_p'], ret['m_q'], ret['logs_q'] = self.fvae(
                    tgt_mels, tgt_nonpadding_BHT, cond=x)
                if global_step < hparams['posterior_start_steps']:
                    z = torch.randn_like(z)
            x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
            ret['pre_mel_out'] = x_recon
            return x_recon

    def forward_dur(self, dur_input, mel2word, ret, **kwargs):
        """

        :param dur_input: [B, T_txt, H]
        :param mel2word: [B, T_mel]
        :param ret:
        :return:
        """
        src_padding = dur_input.data.abs().sum(-1) == 0
        dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
        dur = self.dur_predictor(dur_input, src_padding)
        if hparams['dur_level'] == 'word':
            word_len = kwargs['word_len']
            ph2word = kwargs['ph2word']
            B, T_ph = ph2word.shape
            dur = torch.zeros([B, word_len.max() + 1]).to(ph2word.device).scatter_add(1, ph2word, dur)
            dur = dur[:, 1:]
        ret['dur'] = dur
        if mel2word is None:
            mel2word = self.length_regulator(dur).detach()
        return mel2word

    def get_pos_embed(self, word2word, x2word):
        x_pos = build_word_mask(word2word, x2word).float()  # [B, T_word, T_ph]
        x_pos = (x_pos.cumsum(-1) / x_pos.sum(-1).clamp(min=1)[..., None] * x_pos).sum(1)
        x_pos = self.sin_pos(x_pos.float())  # [B, T_ph, H]
        return x_pos

    def store_inverse_all(self):
        def remove_weight_norm(m):
            try:
                if hasattr(m, 'store_inverse'):
                    m.store_inverse()
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(remove_weight_norm)
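# A tiny standalone sketch (not part of the original file) of the word-level duration pooling used
# in PortaSpeech.forward_dur above: phone durations are summed per word with scatter_add over
# ph2word, where word indices start at 1 and index 0 only collects padding.
import torch

ph2word = torch.tensor([[1, 1, 2, 2, 2, 3, 0]])         # [B, T_ph], 0 marks padding phones
ph_dur = torch.tensor([[2., 3., 1., 1., 2., 4., 0.]])   # per-phone durations from the predictor
word_len = ph2word.max(dim=1)[0]                        # [B]

word_dur = torch.zeros([1, int(word_len.max()) + 1]).scatter_add(1, ph2word, ph_dur)
word_dur = word_dur[:, 1:]                              # drop the padding bucket
print(word_dur)  # tensor([[5., 4., 4.]])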
@@ -1,75 +0,0 @@
import torch
import torch.distributions as dist
from torch import nn
from modules.commons.normalizing_flow.glow_modules import Glow
from modules.portaspeech.portaspeech import PortaSpeech
from utils.hparams import hparams


class PortaSpeechFlow(PortaSpeech):
    def __init__(self, ph_dict_size, word_dict_size, out_dims=None):
        super().__init__(ph_dict_size, word_dict_size, out_dims)
        cond_hs = 80
        if hparams.get('use_txt_cond', True):
            cond_hs = cond_hs + hparams['hidden_size']
        if hparams.get('use_latent_cond', False):
            cond_hs = cond_hs + hparams['latent_size']
        if hparams['use_cond_proj']:
            self.g_proj = nn.Conv1d(cond_hs, 160, 5, padding=2)
            cond_hs = 160
        self.post_flow = Glow(
            80, hparams['post_glow_hidden'], hparams['post_glow_kernel_size'], 1,
            hparams['post_glow_n_blocks'], hparams['post_glow_n_block_layers'],
            n_split=4, n_sqz=2,
            gin_channels=cond_hs,
            share_cond_layers=hparams['post_share_cond_layers'],
            share_wn_layers=hparams['share_wn_layers'],
            sigmoid_scale=hparams['sigmoid_scale']
        )
        self.prior_dist = dist.Normal(0, 1)

    def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
                spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
                forward_post_glow=True, two_stage=True, global_step=None, **kwargs):
        is_training = self.training
        train_fvae = not (forward_post_glow and two_stage)
        if not train_fvae:
            self.eval()
        with torch.set_grad_enabled(mode=train_fvae):
            ret = super(PortaSpeechFlow, self).forward(
                txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph,
                spk_embed, spk_id, pitch, infer, tgt_mels, global_step, **kwargs)
            if (forward_post_glow or not two_stage) and hparams['use_post_flow']:
                self.run_post_glow(tgt_mels, infer, is_training, ret)
        return ret

    def run_post_glow(self, tgt_mels, infer, is_training, ret):
        x_recon = ret['mel_out'].transpose(1, 2)
        g = x_recon
        B, _, T = g.shape
        if hparams.get('use_txt_cond', True):
            g = torch.cat([g, ret['decoder_inp'].transpose(1, 2)], 1)
        if hparams.get('use_latent_cond', False):
            g_z = ret['z_p'][:, :, :, None].repeat(1, 1, 1, 4).reshape(B, -1, T)
            g = torch.cat([g, g_z], 1)
        if hparams['use_cond_proj']:
            g = self.g_proj(g)
        prior_dist = self.prior_dist
        if not infer:
            if is_training:
                self.post_flow.train()
            nonpadding = ret['nonpadding'].transpose(1, 2)
            y_lengths = nonpadding.sum(-1)
            if hparams['detach_postflow_input']:
                g = g.detach()
            tgt_mels = tgt_mels.transpose(1, 2)
            z_postflow, ldj = self.post_flow(tgt_mels, nonpadding, g=g)
            ldj = ldj / y_lengths / 80
            ret['z_pf'], ret['ldj_pf'] = z_postflow, ldj
            ret['postflow'] = -prior_dist.log_prob(z_postflow).mean() - ldj.mean()
            if torch.isnan(ret['postflow']):
                ret['postflow'] = None
        else:
            nonpadding = torch.ones_like(x_recon[:, :1, :])
            z_post = torch.randn(x_recon.shape).to(g.device) * hparams['noise_scale']
            x_recon, _ = self.post_flow(z_post, nonpadding, g, reverse=True)
            ret['mel_out'] = x_recon.transpose(1, 2)
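# A toy sketch (not part of the original file) of the flow NLL that run_post_glow minimises above:
# push the target mel through the flow, then take -log N(z; 0, 1) minus the per-frame, per-channel
# log-determinant. A scalar affine map stands in for the Glow module purely for illustration.
import torch
import torch.distributions as dist

prior = dist.Normal(0, 1)
mel = torch.randn(2, 80, 100)                    # [B, 80, T] target mel
log_scale = torch.zeros(1, requires_grad=True)   # toy invertible map: z = mel * exp(log_scale)

z = mel * log_scale.exp()
ldj = log_scale * 80 * mel.shape[-1]             # log|det| of the map, summed over all elements
ldj = ldj / mel.shape[-1] / 80                   # normalised per frame and channel, as in the file
loss_postflow = -prior.log_prob(z).mean() - ldj.mean()
loss_postflow.backward()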
README.md
@@ -2,23 +2,26 @@
**AudioGPT** connects ChatGPT and a series of Audio Foundation Models to enable **sending** and **receiving** speech, singing voice, audio, and talking head during chatting.

<a href="https://huggingface.co/spaces/AIGC-Audio/AudioGPT">
<img src="https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-blue" alt="Open in Spaces">
</a>

## Capabilities

Up-to-date link: https://93868c7fa583f4b5.gradio.app
Up-to-date link: https://cdb7b543afd1c8e8.gradio.app

Here we list the capability of AudioGPT at this time. More supported models and tasks are coming soon. For prompt examples, refer to [asset](assets/README.md).

### Speech
| Task | Supported Foundation Models | Status |
|:-------------------------:|:-------------------------------:|:------:|
|:--------------------------:|:-------------------------------:|:------:|
| Text-to-Speech | [FastSpeech](), [SyntaSpeech](), [VITS]() | Yes (WIP) |
| Style Transfer | [GenerSpeech]() | Yes |
| Speech Recognition | [whisper](), [Conformer]() | Yes |
| Speech Enhancement | [ConvTasNet]() | WIP |
| Speech Separation | [TF-GridNet]() | WIP |
| Speech Translation | [Multi-decoder]() | WIP |
| Mono-to-Binaural Speech | [NeuralWarp]() | Yes |
| Mono-to-Binaural | [NeuralWarp]() | Yes |

### Sing

@@ -28,13 +31,13 @@ Here we list the capability of AudioGPT at this time. More supported models and

### Audio
| Task | Supported Foundation Models | Status |
|:----------------:|:---------------------------:|:---------:|
|:----------------------:|:---------------------------:|:------:|
| Text-to-Audio | [Make-An-Audio]() | Yes |
| Audio Inpainting | [Make-An-Audio]() | Yes |
| Image-to-Audio | [Make-An-Audio]() | Yes |
| Sound Detection | [Audio-transformer]() | Yes (WIP) |
| Target sound detection | [TSDNet]() | Yes (WIP) |
| Sound Extraction | [LASSNet]() | Yes (WIP) |
| Sound Detection | [Audio-transformer]() | Yes |
| Target Sound Detection | [TSDNet]() | Yes |
| Sound Extraction | [LASSNet]() | Yes |


### Talking Head

@@ -44,7 +47,8 @@ Here we list the capability of AudioGPT at this time. More supported models and
| Talking Head Synthesis | [GeneFace]() | Yes (WIP) |

## Internal Version Updates
4.3 Support Talking Head Synthesis\
4.6 Support Sound Extraction/Detection\
4.3 Support huggingface demo space\
4.1 Support Audio inpainting and clean codes\
3.27 Support Style Transfer/Talking head Synthesis\
3.23 Support Text-to-Sing\

@@ -54,10 +58,9 @@ Here we list the capability of AudioGPT at this time. More supported models and

## Todo
- [x] clean text to sing/speech code
- [ ] import Espnet models for speech tasks
- [ ] merge talking head synthesis into main
- [x] change audio/video log output
- [ ] support huggingface space
- [x] support huggingface space

## Acknowledgement
We appreciate the open source of the following projects:
@@ -66,13 +66,8 @@ Input Example : Please tell me the text description of this audio.<br />
Output:<br />
<br />

## Image
### Text-To-Image
Input Example : Generate an image of a horse<br />
Output:<br />
<br />

## Sound Detection
### Sound Detection
First upload your audio (.wav)<br />
Audio Example :<br />
<audio src="mix.wav" controls></audio><br />

@@ -80,21 +75,21 @@ Input Example : What events does this audio include?<br />
Output:<br />
<br />

## Mono audio to Binaural Audio
### Mono audio to Binaural Audio
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: Transfer the mono speech to a binaural audio.<br />
Output:<br />
<br />

## Target Sound Detection
### Target Sound Detection
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: please help me detect the target sound in the audio based on description: "I want to detect Applause event"<br />
Output:<br />
<br />

## Sound Extraction
### Sound Extraction
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: Please help me extract the sound events from the audio based on the description: "a person shouts nearby and then emergency vehicle sirens sounds"<br />
@@ -51,16 +51,16 @@ from target_sound_detection.src.models import event_labels
from target_sound_detection.src.utils import median_filter, decode_with_timestamps
import clip
import numpy as np
AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
AUdio ChatGPT can not directly read audios, but it has a list of tools to finish different audio synthesis tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, Audio ChatGPT is very strict to the file name and will never fabricate nonexistent files.
AUdio ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide Audio ChatGPT with a description. Audio ChatGPT should generate audios according to this description rather than directly imagine from memory or yourself."

AUDIO_CHATGPT_PREFIX = """AudioGPT
AudioGPT can not directly read audios, but it has a list of tools to finish different speech, audio, and singing voice tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, AudioGPT is very strict to the file name and will never fabricate nonexistent files.
AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.

TOOLS:
------

Audio ChatGPT has access to the following tools:"""
AudioGPT has access to the following tools:"""

AUDIO_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:

@@ -238,7 +238,7 @@ class I2A:
        image = Image.open(image)
        image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
        image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
        c = image_embedding.repeat(n_samples, 1, 1)  # shape [1, 77, 1280]: still per-word (token) embeddings, not yet pooled into a sentence embedding
        c = image_embedding.repeat(n_samples, 1, 1)
        shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8]  # (z_dim, 80//2^x, 848//2^x)
        samples_ddim, _ = self.sampler.sample(S=ddim_steps,
                                              conditioning=c,
@@ -396,9 +396,9 @@ class Inpaint:
        sr, ori_wav = wavfile.read(input_audio_path)
        print("gen_mel")
        print(sr, ori_wav.shape, ori_wav)
        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' stores the array in C (row-major) layout; safe to ignore
        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
        if len(ori_wav.shape) == 2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio load wav shape could be (wav_len, 2) but librosa expects (2, wav_len)
            ori_wav = librosa.to_mono(ori_wav.T)
        print(sr, ori_wav.shape, ori_wav)
        ori_wav = librosa.resample(ori_wav, orig_sr=sr, target_sr=SAMPLE_RATE)

@@ -417,9 +417,9 @@ class Inpaint:
        print("gen_mel_audio")
        print(sr, ori_wav.shape, ori_wav)

        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' stores the array in C (row-major) layout; safe to ignore
        ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
        if len(ori_wav.shape) == 2:  # stereo
            ori_wav = librosa.to_mono(ori_wav.T)  # gradio load wav shape could be (wav_len, 2) but librosa expects (2, wav_len)
            ori_wav = librosa.to_mono(ori_wav.T)
        print(sr, ori_wav.shape, ori_wav)
        ori_wav = librosa.resample(ori_wav, orig_sr=sr, target_sr=SAMPLE_RATE)

@@ -432,7 +432,7 @@ class Inpaint:
        mel = TRANSFORMS_16000(input_wav)
        return mel
    def show_mel_fn(self, input_audio_path):
        crop_len = 500  # the full mel cannot be shown due to gradio's Image bug when using tool='sketch'
        crop_len = 500
        crop_mel = self.gen_mel(input_audio_path)[:, :crop_len]
        color_mel = self.cmap_transform(crop_mel)
        image = Image.fromarray((color_mel * 255).astype(np.uint8))
@@ -473,11 +473,11 @@ class Inpaint:
        torch.set_grad_enabled(False)
        mel_img = Image.open(mel_and_mask['image'])
        mask_img = Image.open(mel_and_mask["mask"])
        show_mel = np.array(mel_img.convert("L")) / 255  # only part of the mel is displayed, so the mel must be regenerated from the audio
        show_mel = np.array(mel_img.convert("L")) / 255
        mask = np.array(mask_img.convert("L")) / 255
        mel_bins, mel_len = 80, 848
        input_mel = self.gen_mel_audio(input_audio)[:, :mel_len]  # only part of the mel is displayed, so the mel must be regenerated from the audio
        mask = np.pad(mask, ((0, 0), (0, mel_len - mask.shape[1])), mode='constant', constant_values=0)  # pad the mask to the size of the original mel
        input_mel = self.gen_mel_audio(input_audio)[:, :mel_len]
        mask = np.pad(mask, ((0, 0), (0, mel_len - mask.shape[1])), mode='constant', constant_values=0)
        print(mask.shape, input_mel.shape)
        with torch.no_grad():
            batch = self.make_batch_sd(input_mel, mask, num_samples=1)
@@ -781,7 +781,7 @@ class TargetSoundDetection:

class ConversationBot:
    def __init__(self):
        print("Initializing AudioChatGPT")
        print("Initializing AudioGPT")
        self.llm = OpenAI(temperature=0)
        self.t2i = T2I(device="cuda:0")
        self.i2t = ImageCaptioning(device="cuda:1")
@@ -820,19 +820,19 @@ class ConversationBot:
                 "Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
                 "The input to this tool should be a comma separated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
            Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
                 description="useful for when you want to convert a user input text into speech audio it saved it to a file."
                 description="useful for when you want to convert a user input text into speech and saved it to a file."
                 "The input to this tool should be a string, representing the text used to be converted to speech."),
            Tool(name="Generate Audio From The Image", func=self.i2a.inference,
                 description="useful for when you want to generate an audio based on an image."
                 "The input to this tool should be a string, representing the image_path. "),
                 "The input to this tool should be a string, representing the image path. "),
            Tool(name="Generate Text From The Audio", func=self.a2t.inference,
                 description="useful for when you want to describe an audio in text, receives audio_path as input."
                 "The input to this tool should be a string, representing the audio_path."),
                 description="useful for when you want to generate description of an audio or know what is inside the audio."
                 "The input to this tool should be a string, representing the audio path."),
            Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
                 description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
                 "The input to this tool should be a string, representing the audio_path."),
                 description="useful for when you want to inpaint or manipulate an audio, this tool receives audio path as input, "
                 "The input to this tool should be a string, representing the audio path."),
            Tool(name="Transcribe speech", func=self.asr.inference,
                 description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
                 description="useful for when you want to know the content and transcription corresponding to a human speech, receives audio_path as input."
                 "The input to this tool should be a string, representing the audio_path."),
            Tool(name="Detect the sound event from the audio", func=self.detection.inference,
                 description="useful for when you want to know what event in the audio and the sound event start or end time, receives audio_path as input. "
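# The Tool(...) entries above follow LangChain's Tool(name, func, description) pattern; the agent
# chooses a tool from its description alone, which is why every description spells out the expected
# input format. A hedged sketch of the registration, assuming the classic langchain agents API used
# by this script (illustration only; fake_tts is a hypothetical stand-in for self.tts.inference):
from langchain.agents import Tool, initialize_agent
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory

def fake_tts(text: str) -> str:
    # stand-in for self.tts.inference; would return the path of a generated wav
    return "audio/demo.wav"

tools = [
    Tool(name="Synthesize Speech Given the User Input Text", func=fake_tts,
         description="useful for when you want to convert a user input text into speech and save it to a file. "
                     "The input to this tool should be a string, representing the text to be converted to speech."),
]
agent = initialize_agent(tools, OpenAI(temperature=0), agent="conversational-react-description",
                         memory=ConversationBufferMemory(memory_key="chat_history", output_key="output"),
                         verbose=True)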
@@ -959,8 +959,8 @@ if __name__ == '__main__':
    bot = ConversationBot()
    with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
        with gr.Row():
            gr.Markdown("## Audio ChatGPT")
        chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT")
            gr.Markdown("## AudioGPT")
        chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
        state = gr.State([])
        with gr.Row():
            with gr.Column(scale=0.7):
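# A minimal sketch (not part of the original script) of the Blocks layout edited above, wired to a
# placeholder callback instead of ConversationBot; gradio 3.x style API assumed.
import gradio as gr

def respond(message, history):
    # stand-in for ConversationBot.run_text; appends a (user, bot) pair to the chat history
    history = history + [(message, f"echo: {message}")]
    return history, history

with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
    gr.Markdown("## AudioGPT")
    chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter")
    txt.submit(respond, [txt, state], [chatbot, state])

demo.launch()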
@@ -32,7 +32,7 @@ wget -P text_to_speech/checkpoints/ljspeech/ps_adv_baseline -i https://huggingfa
wget -P audio_to_text/audiocaps_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/clotho_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
# audio detection
# Audio detection
cd audio_detection/audio_infer/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/audio_detection.pth
cd mono2binaural/useful_ckpts