Merge pull request #15 from Rongjiehuang/main

update
Authored by Rongjiehuang on 2023-04-09 16:07:37 +07:00, committed via GitHub.
10 changed files with 55 additions and 760 deletions

LICENSE (changes not shown in this view)

@@ -1,9 +1 @@
---
title: DiffSinger🎶 Diffusion for Singing Voice Synthesis
emoji: 🎶
colorFrom: purple
colorTo: blue
sdk: gradio
app_file: "inference/svs/gradio/infer.py"
pinned: false
---
In this directory, we support FastSpeech, GenerSpeech, SyntaSpeech, and DiffSinger.

@@ -1,188 +0,0 @@
from utils.hparams import hparams
import torch
from torch import nn
import torch.nn.functional as F
from modules.commons.conv import TextConvEncoder, ConvBlocks
from modules.commons.common_layers import Embedding
from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
from utils.pitch_utils import denorm_f0, f0_to_coarse
FS_ENCODERS = {
'rel_fft': lambda hp, dict: RelTransformerEncoder(
len(dict), hp['hidden_size'], hp['hidden_size'],
hp['ffn_hidden_size'], hp['num_heads'], hp['enc_layers'],
hp['enc_ffn_kernel_size'], hp['dropout'], prenet=hp['enc_prenet'], pre_ln=hp['enc_pre_ln']),
}
FS_DECODERS = {
'conv': lambda hp: ConvBlocks(hp['hidden_size'], hp['hidden_size'], hp['dec_dilations'],
hp['dec_kernel_size'], layers_in_block=hp['layers_in_block'],
norm_type=hp['enc_dec_norm'], dropout=hp['dropout'],
post_net_kernel=hp.get('dec_post_net_kernel', 3)),
}
class DurationPredictor(torch.nn.Module):
def __init__(self, idim, n_layers=2, n_chans=384, kernel_size=3, dropout_rate=0.1, offset=1.0):
super(DurationPredictor, self).__init__()
self.offset = offset
self.conv = torch.nn.ModuleList()
self.kernel_size = kernel_size
for idx in range(n_layers):
in_chans = idim if idx == 0 else n_chans
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
def forward(self, x, x_padding=None):
x = x.transpose(1, -1) # (B, idim, Tmax)
for f in self.conv:
x = f(x) # (B, C, Tmax)
if x_padding is not None:
x = x * (1 - x_padding.float())[:, None, :]
x = self.linear(x.transpose(1, -1)) # [B, T, C]
x = x * (1 - x_padding.float())[:, :, None] # (B, T, C)
x = x[..., 0] # (B, Tmax)
return x
class FastSpeech(nn.Module):
def __init__(self, dict_size, out_dims=None):
super().__init__()
self.enc_layers = hparams['enc_layers']
self.dec_layers = hparams['dec_layers']
self.hidden_size = hparams['hidden_size']
if hparams.get("use_bert") is True:
self.ph_encoder = BERTRelTransformerEncoder(dict_size, hparams['hidden_size'], hparams['hidden_size'],
hparams['ffn_hidden_size'], hparams['num_heads'], hparams['enc_layers'],
hparams['enc_ffn_kernel_size'], hparams['dropout'], prenet=hparams['enc_prenet'], pre_ln=hparams['enc_pre_ln'])
else:
self.ph_encoder = FS_ENCODERS[hparams['encoder_type']](hparams, dict_size)
self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
self.out_dims = hparams['audio_num_mel_bins'] if out_dims is None else out_dims
self.mel_out = nn.Linear(self.hidden_size, self.out_dims, bias=True)
if hparams['use_spk_id']:
self.spk_id_proj = Embedding(hparams['num_spk'], self.hidden_size)
if hparams['use_spk_embed']:
self.spk_embed_proj = nn.Linear(256, self.hidden_size, bias=True)
predictor_hidden = hparams['predictor_hidden'] if hparams['predictor_hidden'] > 0 else self.hidden_size
self.dur_predictor = DurationPredictor(
self.hidden_size,
n_chans=predictor_hidden,
n_layers=hparams['dur_predictor_layers'],
dropout_rate=hparams['predictor_dropout'],
kernel_size=hparams['dur_predictor_kernel'])
self.length_regulator = LengthRegulator()
if hparams['use_pitch_embed']:
self.pitch_embed = Embedding(300, self.hidden_size, 0)
self.pitch_predictor = PitchPredictor(
self.hidden_size, n_chans=predictor_hidden,
n_layers=5, dropout_rate=0.1, odim=2,
kernel_size=hparams['predictor_kernel'])
if hparams['dec_inp_add_noise']:
self.z_channels = hparams['z_channels']
self.dec_inp_noise_proj = nn.Linear(self.hidden_size + self.z_channels, self.hidden_size)
def forward(self, txt_tokens, mel2ph=None, spk_embed=None, spk_id=None,
f0=None, uv=None, infer=False, **kwargs):
ret = {}
src_nonpadding = (txt_tokens > 0).float()[:, :, None]
style_embed = self.forward_style_embed(spk_embed, spk_id)
use_bert = hparams.get("use_bert") is True
if use_bert:
encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=kwargs['ph2word'],
ret=ret) * src_nonpadding + style_embed
else:
encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed
# add dur
dur_inp = (encoder_out + style_embed) * src_nonpadding
mel2ph = self.forward_dur(dur_inp, mel2ph, txt_tokens, ret)
tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
decoder_inp = expand_states(encoder_out, mel2ph)
# add pitch embed
if hparams['use_pitch_embed']:
pitch_inp = (decoder_inp + style_embed) * tgt_nonpadding
decoder_inp = decoder_inp + self.forward_pitch(pitch_inp, f0, uv, mel2ph, ret, encoder_out)
# decoder input
ret['decoder_inp'] = decoder_inp = (decoder_inp + style_embed) * tgt_nonpadding
if hparams['dec_inp_add_noise']:
B, T, _ = decoder_inp.shape
z = kwargs.get('adv_z', torch.randn([B, T, self.z_channels])).to(decoder_inp.device)
ret['adv_z'] = z
decoder_inp = torch.cat([decoder_inp, z], -1)
decoder_inp = self.dec_inp_noise_proj(decoder_inp) * tgt_nonpadding
ret['mel_out'] = self.forward_decoder(decoder_inp, tgt_nonpadding, ret, infer=infer, **kwargs)
return ret
def forward_style_embed(self, spk_embed=None, spk_id=None):
# add spk embed
style_embed = 0
if hparams['use_spk_embed']:
style_embed = style_embed + self.spk_embed_proj(spk_embed)[:, None, :]
if hparams['use_spk_id']:
style_embed = style_embed + self.spk_id_proj(spk_id)[:, None, :]
return style_embed
def forward_dur(self, dur_input, mel2ph, txt_tokens, ret):
"""
:param dur_input: [B, T_txt, H]
:param mel2ph: [B, T_mel]
:param txt_tokens: [B, T_txt]
:param ret: dict collecting intermediate outputs
:return: mel2ph, [B, T_mel]
"""
src_padding = txt_tokens == 0
if hparams['predictor_grad'] != 1:
dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
dur = self.dur_predictor(dur_input, src_padding)
ret['dur'] = dur
if mel2ph is None:
mel2ph = self.length_regulator(dur, src_padding).detach()
ret['mel2ph'] = mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
return mel2ph
def forward_pitch(self, decoder_inp, f0, uv, mel2ph, ret, encoder_out=None):
if hparams['pitch_type'] == 'frame':
pitch_pred_inp = decoder_inp
pitch_padding = mel2ph == 0
else:
pitch_pred_inp = encoder_out
pitch_padding = encoder_out.abs().sum(-1) == 0
uv = None
if hparams['predictor_grad'] != 1:
pitch_pred_inp = pitch_pred_inp.detach() + \
hparams['predictor_grad'] * (pitch_pred_inp - pitch_pred_inp.detach())
ret['pitch_pred'] = pitch_pred = self.pitch_predictor(pitch_pred_inp)
use_uv = hparams['pitch_type'] == 'frame' and hparams['use_uv']
if f0 is None:
f0 = pitch_pred[:, :, 0]
if use_uv:
uv = pitch_pred[:, :, 1] > 0
f0_denorm = denorm_f0(f0, uv if use_uv else None, pitch_padding=pitch_padding)
pitch = f0_to_coarse(f0_denorm) # start from 0 [B, T_txt]
ret['f0_denorm'] = f0_denorm
ret['f0_denorm_pred'] = denorm_f0(
pitch_pred[:, :, 0], (pitch_pred[:, :, 1] > 0) if use_uv else None,
pitch_padding=pitch_padding)
if hparams['pitch_type'] == 'ph':
pitch = torch.gather(F.pad(pitch, [1, 0]), 1, mel2ph)
ret['f0_denorm'] = torch.gather(F.pad(ret['f0_denorm'], [1, 0]), 1, mel2ph)
ret['f0_denorm_pred'] = torch.gather(F.pad(ret['f0_denorm_pred'], [1, 0]), 1, mel2ph)
pitch_embed = self.pitch_embed(pitch)
return pitch_embed
def forward_decoder(self, decoder_inp, tgt_nonpadding, ret, infer, **kwargs):
x = decoder_inp # [B, T, H]
x = self.decoder(x)
x = self.mel_out(x)
return x * tgt_nonpadding
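The deleted `fs.py` above defines the FastSpeech backbone. For orientation, here is a minimal shape-check sketch of the `DurationPredictor` it contains; it assumes the repo's `modules` and `utils` packages are importable, and the tensor sizes are illustrative only:

```python
import torch
from modules.portaspeech.fs import DurationPredictor  # module path taken from the import in portaspeech.py below

dur_pred = DurationPredictor(idim=192, n_layers=2, n_chans=384, kernel_size=3)
x = torch.randn(2, 13, 192)             # [B, T_txt, H] phoneme hidden states
padding = torch.zeros(2, 13).bool()     # True where a token is padding
dur = dur_pred(x, padding)              # [B, T_txt] non-negative per-phoneme durations (Softplus output)
print(dur.shape)                        # torch.Size([2, 13])
```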

@@ -1,202 +0,0 @@
import numpy as np
import torch
import torch.distributions as dist
from torch import nn
from modules.commons.conv import ConditionalConvBlocks
from modules.commons.normalizing_flow.res_flow import ResFlow
from modules.commons.wavenet import WN
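# NOTE: the SyntaFVAE class at the bottom of this file also relies on a GraphAuxEnc word-graph encoder; its import lies outside the lines shown in this hunk.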
class FVAEEncoder(nn.Module):
def __init__(self, c_in, hidden_size, c_latent, kernel_size,
n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
super().__init__()
self.strides = strides
self.hidden_size = hidden_size
if np.prod(strides) == 1:
self.pre_net = nn.Conv1d(c_in, hidden_size, kernel_size=1)
else:
self.pre_net = nn.Sequential(*[
nn.Conv1d(c_in, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
if i == 0 else
nn.Conv1d(hidden_size, hidden_size, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
if nn_type == 'wn':
self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
elif nn_type == 'conv':
self.nn = ConditionalConvBlocks(
hidden_size, c_cond, hidden_size, None, kernel_size,
layers_in_block=2, is_BTC=False, num_layers=n_layers)
self.out_proj = nn.Conv1d(hidden_size, c_latent * 2, 1)
self.latent_channels = c_latent
def forward(self, x, nonpadding, cond):
x = self.pre_net(x)
nonpadding = nonpadding[:, :, ::np.prod(self.strides)][:, :, :x.shape[-1]]
x = x * nonpadding
x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
x = self.out_proj(x)
m, logs = torch.split(x, self.latent_channels, dim=1)
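# reparameterisation trick: z = mu + sigma * eps, with sigma = exp(logs) and eps ~ N(0, I)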
z = (m + torch.randn_like(m) * torch.exp(logs))
return z, m, logs, nonpadding
class FVAEDecoder(nn.Module):
def __init__(self, c_latent, hidden_size, out_channels, kernel_size,
n_layers, c_cond=0, p_dropout=0, strides=[4], nn_type='wn'):
super().__init__()
self.strides = strides
self.hidden_size = hidden_size
self.pre_net = nn.Sequential(*[
nn.ConvTranspose1d(c_latent, hidden_size, kernel_size=s, stride=s)
if i == 0 else
nn.ConvTranspose1d(hidden_size, hidden_size, kernel_size=s, stride=s)
for i, s in enumerate(strides)
])
if nn_type == 'wn':
self.nn = WN(hidden_size, kernel_size, 1, n_layers, c_cond, p_dropout)
elif nn_type == 'conv':
self.nn = ConditionalConvBlocks(
hidden_size, c_cond, hidden_size, [1] * n_layers, kernel_size,
layers_in_block=2, is_BTC=False)
self.out_proj = nn.Conv1d(hidden_size, out_channels, 1)
def forward(self, x, nonpadding, cond):
x = self.pre_net(x)
x = x * nonpadding
x = self.nn(x, nonpadding=nonpadding, cond=cond) * nonpadding
x = self.out_proj(x)
return x
class FVAE(nn.Module):
def __init__(self,
c_in_out, hidden_size, c_latent,
kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
encoder_type='wn', decoder_type='wn'):
super(FVAE, self).__init__()
self.strides = strides
self.hidden_size = hidden_size
self.latent_size = c_latent
self.use_prior_flow = use_prior_flow
if np.prod(strides) == 1:
self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
else:
self.g_pre_net = nn.Sequential(*[
nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
if use_prior_flow:
self.prior_flow = ResFlow(
c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
self.prior_dist = dist.Normal(0, 1)
def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0, **kwargs):
"""
:param x: [B, C_in_out, T]
:param nonpadding: [B, 1, T]
:param cond: [B, C_g, T]
:return:
"""
if nonpadding is None:
nonpadding = 1
cond_sqz = self.g_pre_net(cond)
if not infer:
z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
q_dist = dist.Normal(m_q, logs_q.exp())
if self.use_prior_flow:
logqx = q_dist.log_prob(z_q)
z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
logpx = self.prior_dist.log_prob(z_p)
loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
else:
loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
z_p = None
return z_q, loss_kl, z_p, m_q, logs_q
else:
latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
if self.use_prior_flow:
z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
return z_p
class SyntaFVAE(nn.Module):
def __init__(self,
c_in_out, hidden_size, c_latent,
kernel_size, enc_n_layers, dec_n_layers, c_cond, strides,
use_prior_flow, flow_hidden=None, flow_kernel_size=None, flow_n_steps=None,
encoder_type='wn', decoder_type='wn'):
super(SyntaFVAE, self).__init__()
self.strides = strides
self.hidden_size = hidden_size
self.latent_size = c_latent
self.use_prior_flow = use_prior_flow
if np.prod(strides) == 1:
self.g_pre_net = nn.Conv1d(c_cond, c_cond, kernel_size=1)
else:
self.g_pre_net = nn.Sequential(*[
nn.Conv1d(c_cond, c_cond, kernel_size=s * 2, stride=s, padding=s // 2)
for i, s in enumerate(strides)
])
self.encoder = FVAEEncoder(c_in_out, hidden_size, c_latent, kernel_size,
enc_n_layers, c_cond, strides=strides, nn_type=encoder_type)
if use_prior_flow:
self.prior_flow = ResFlow(
c_latent, flow_hidden, flow_kernel_size, flow_n_steps, 4, c_cond=c_cond)
self.decoder = FVAEDecoder(c_latent, hidden_size, c_in_out, kernel_size,
dec_n_layers, c_cond, strides=strides, nn_type=decoder_type)
self.prior_dist = dist.Normal(0, 1)
self.graph_encoder = GraphAuxEnc(in_dim=hidden_size, hid_dim=hidden_size,out_dim=hidden_size)
def forward(self, x=None, nonpadding=None, cond=None, infer=False, noise_scale=1.0,
mel2word=None, ph2word=None, graph_lst=None, etypes_lst=None):
"""
:param x: target mel, [B, C_in_out, T]
:param nonpadding: [B, 1, T]
:param cond: phoneme encoding, [B, C_g, T]
:return:
"""
word_len = ph2word.max(dim=1)[0]
ph_encoding_for_graph = cond.detach() + 0.1 * (cond - cond.detach()) # only 0.1x grad can pass through
_, ph_out_word_encoding_for_graph = GraphAuxEnc.ph_encoding_to_word_encoding(ph_encoding_for_graph.transpose(1,2), mel2word, word_len)
t_m = mel2word.shape[-1]
g_graph = self.graph_encoder.word_forward(graph_lst=graph_lst, word_encoding=ph_out_word_encoding_for_graph, etypes_lst=etypes_lst)
g_graph = g_graph.transpose(1,2)
g_graph = GraphAuxEnc._postprocess_word2ph(g_graph,mel2word,t_m)
g_graph = g_graph.transpose(1,2)
cond = cond + g_graph * 1.
if nonpadding is None:
nonpadding = 1
cond_sqz = self.g_pre_net(cond)
if not infer:
z_q, m_q, logs_q, nonpadding_sqz = self.encoder(x, nonpadding, cond_sqz)
q_dist = dist.Normal(m_q, logs_q.exp())
if self.use_prior_flow:
logqx = q_dist.log_prob(z_q)
z_p = self.prior_flow(z_q, nonpadding_sqz, cond_sqz)
logpx = self.prior_dist.log_prob(z_p)
loss_kl = ((logqx - logpx) * nonpadding_sqz).sum() / nonpadding_sqz.sum() / logqx.shape[1]
else:
loss_kl = torch.distributions.kl_divergence(q_dist, self.prior_dist)
loss_kl = (loss_kl * nonpadding_sqz).sum() / nonpadding_sqz.sum() / z_q.shape[1]
z_p = None
return z_q, loss_kl, z_p, m_q, logs_q
else:
latent_shape = [cond_sqz.shape[0], self.latent_size, cond_sqz.shape[2]]
z_p = torch.randn(latent_shape).to(cond.device) * noise_scale
if self.use_prior_flow:
z_p = self.prior_flow(z_p, 1, cond_sqz, reverse=True)
return z_p
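For context, a hedged usage sketch of the `FVAE` defined above; it assumes the repo's `modules` package (with its `WN`/`ConvBlocks` dependencies) is importable, and the constructor values are illustrative rather than the repo's config:

```python
import torch
from modules.portaspeech.fvae import FVAE  # path assumed from the import in portaspeech.py below

fvae = FVAE(c_in_out=80, hidden_size=192, c_latent=16, kernel_size=5,
            enc_n_layers=8, dec_n_layers=4, c_cond=192, strides=[4],
            use_prior_flow=False)
mel = torch.randn(2, 80, 128)           # [B, C_in_out, T] target mel (T divisible by the stride)
nonpadding = torch.ones(2, 1, 128)      # [B, 1, T]
cond = torch.randn(2, 192, 128)         # [B, C_g, T] phoneme condition
z_q, loss_kl, z_p, m_q, logs_q = fvae(mel, nonpadding, cond, infer=False)  # training path
z = fvae(cond=cond, infer=True, noise_scale=0.8)                           # sampling path
```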

@@ -1,230 +0,0 @@
import math
import torch
from torch import nn
from torch.nn import Linear
from modules.commons.conv import ConvBlocks, ConditionalConvBlocks
from modules.commons.common_layers import Embedding
from modules.commons.rel_transformer import RelTransformerEncoder
from modules.commons.transformer import MultiheadAttention, FFTBlocks
from modules.commons.align_ops import clip_mel2token_to_multiple, build_word_mask, expand_states, mel2ph_to_mel2word
from modules.portaspeech.fs import FS_DECODERS, FastSpeech
from modules.portaspeech.fvae import FVAE
from utils.tts_utils import group_hidden_by_segs
from utils.hparams import hparams
class SinusoidalPosEmb(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, x):
"""
:param x: [B, T]
:return: [B, T, H]
"""
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
emb = x[:, :, None] * emb[None, :]
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
class PortaSpeech(FastSpeech):
def __init__(self, ph_dictionary, word_dictionary, out_dims=None):
super().__init__(ph_dictionary, out_dims)
# build linguistic encoder
if hparams['use_word_encoder']:
# default False, use independent word embedding instead of phoneme encoding to represent word
self.word_encoder = RelTransformerEncoder(
len(word_dictionary), self.hidden_size, self.hidden_size, self.hidden_size, 2,
hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
if hparams['dur_level'] == 'word':
if hparams['word_encoder_type'] == 'rel_fft':
self.ph2word_encoder = RelTransformerEncoder(
0, self.hidden_size, self.hidden_size, self.hidden_size, 2,
hparams['word_enc_layers'], hparams['enc_ffn_kernel_size'])
if hparams['word_encoder_type'] == 'fft':
self.ph2word_encoder = FFTBlocks(
self.hidden_size, hparams['word_enc_layers'], 1, num_heads=hparams['num_heads'])
self.sin_pos = SinusoidalPosEmb(self.hidden_size)
self.enc_pos_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.dec_query_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.dec_res_proj = nn.Linear(2 * self.hidden_size, self.hidden_size)
self.attn = MultiheadAttention(self.hidden_size, 1, encoder_decoder_attention=True, bias=False)
self.attn.enable_torch_version = False
if hparams['text_encoder_postnet']:
self.text_encoder_postnet = ConvBlocks(
self.hidden_size, self.hidden_size, [1] * 3, 5, layers_in_block=2)
else:
self.sin_pos = SinusoidalPosEmb(self.hidden_size)
# build VAE decoder
if hparams['use_fvae']:
del self.decoder
del self.mel_out
self.fvae = FVAE(
c_in_out=self.out_dims,
hidden_size=hparams['fvae_enc_dec_hidden'], c_latent=hparams['latent_size'],
kernel_size=hparams['fvae_kernel_size'],
enc_n_layers=hparams['fvae_enc_n_layers'],
dec_n_layers=hparams['fvae_dec_n_layers'],
c_cond=self.hidden_size,
use_prior_flow=hparams['use_prior_flow'],
flow_hidden=hparams['prior_flow_hidden'],
flow_kernel_size=hparams['prior_flow_kernel_size'],
flow_n_steps=hparams['prior_flow_n_blocks'],
strides=[hparams['fvae_strides']],
encoder_type=hparams['fvae_encoder_type'],
decoder_type=hparams['fvae_decoder_type'],
)
else:
self.decoder = FS_DECODERS[hparams['decoder_type']](hparams)
self.mel_out = Linear(self.hidden_size, self.out_dims, bias=True)
if hparams['use_pitch_embed']:
self.pitch_embed = Embedding(300, self.hidden_size, 0)
if hparams['add_word_pos']:
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
def build_embedding(self, dictionary, embed_dim):
num_embeddings = len(dictionary)
emb = Embedding(num_embeddings, embed_dim, self.padding_idx)
return emb
def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
global_step=None, *args, **kwargs):
ret = {}
style_embed = self.forward_style_embed(spk_embed, spk_id)
x, tgt_nonpadding = self.run_text_encoder(
txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs)
x = x * tgt_nonpadding
ret['nonpadding'] = tgt_nonpadding
if hparams['use_pitch_embed']:
x = x + self.pitch_embed(pitch)
ret['decoder_inp'] = x
ret['mel_out_fvae'] = ret['mel_out'] = self.run_decoder(x, tgt_nonpadding, ret, infer, tgt_mels, global_step)
return ret
def run_text_encoder(self, txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph, style_embed, ret, **kwargs):
word2word = torch.arange(word_len)[None, :].to(ph2word.device) + 1  # 1-indexed word ids, [1, T_word]
src_nonpadding = (txt_tokens > 0).float()[:, :, None]
use_bert = hparams.get("use_bert") is True
if use_bert:
ph_encoder_out = self.ph_encoder(txt_tokens, bert_feats=kwargs['bert_feats'], ph2word=ph2word,
graph_lst=kwargs['graph_lst'], etypes_lst=kwargs['etypes_lst'],
cl_feats=kwargs['cl_feats'], ret=ret) * src_nonpadding + style_embed
else:
ph_encoder_out = self.ph_encoder(txt_tokens) * src_nonpadding + style_embed
if hparams['use_word_encoder']:
word_encoder_out = self.word_encoder(word_tokens) + style_embed
ph_encoder_out = ph_encoder_out + expand_states(word_encoder_out, ph2word)
if hparams['dur_level'] == 'word':
word_encoder_out = 0
h_ph_gb_word = group_hidden_by_segs(ph_encoder_out, ph2word, word_len)[0]
word_encoder_out = word_encoder_out + self.ph2word_encoder(h_ph_gb_word)
if hparams['use_word_encoder']:
word_encoder_out = word_encoder_out + self.word_encoder(word_tokens)
mel2word = self.forward_dur(ph_encoder_out, mel2word, ret, ph2word=ph2word, word_len=word_len)
mel2word = clip_mel2token_to_multiple(mel2word, hparams['frames_multiple'])
tgt_nonpadding = (mel2word > 0).float()[:, :, None]
enc_pos = self.get_pos_embed(word2word, ph2word) # [B, T_ph, H]
dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H]
dec_word_mask = build_word_mask(mel2word, ph2word) # [B, T_mel, T_ph]
x, weight = self.attention(ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask)
if hparams['add_word_pos']:
x = x + self.word_pos_proj(dec_pos)
ret['attn'] = weight
else:
mel2ph = self.forward_dur(ph_encoder_out, mel2ph, ret)
mel2ph = clip_mel2token_to_multiple(mel2ph, hparams['frames_multiple'])
mel2word = mel2ph_to_mel2word(mel2ph, ph2word)
x = expand_states(ph_encoder_out, mel2ph)
if hparams['add_word_pos']:
dec_pos = self.get_pos_embed(word2word, mel2word) # [B, T_mel, H]
x = x + self.word_pos_proj(dec_pos)
tgt_nonpadding = (mel2ph > 0).float()[:, :, None]
if hparams['use_word_encoder']:
x = x + expand_states(word_encoder_out, mel2word)
return x, tgt_nonpadding
def attention(self, ph_encoder_out, enc_pos, word_encoder_out, dec_pos, mel2word, dec_word_mask):
ph_kv = self.enc_pos_proj(torch.cat([ph_encoder_out, enc_pos], -1))
word_enc_out_expend = expand_states(word_encoder_out, mel2word)
word_enc_out_expend = torch.cat([word_enc_out_expend, dec_pos], -1)
if hparams['text_encoder_postnet']:
word_enc_out_expend = self.dec_res_proj(word_enc_out_expend)
word_enc_out_expend = self.text_encoder_postnet(word_enc_out_expend)
dec_q = x_res = word_enc_out_expend
else:
dec_q = self.dec_query_proj(word_enc_out_expend)
x_res = self.dec_res_proj(word_enc_out_expend)
ph_kv, dec_q = ph_kv.transpose(0, 1), dec_q.transpose(0, 1)
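# attention mask: phoneme positions outside the query frame's word get a large negative bias, keeping attention within word boundaries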
x, (weight, _) = self.attn(dec_q, ph_kv, ph_kv, attn_mask=(1 - dec_word_mask) * -1e9)
x = x.transpose(0, 1)
x = x + x_res
return x, weight
def run_decoder(self, x, tgt_nonpadding, ret, infer, tgt_mels=None, global_step=0):
if not hparams['use_fvae']:
x = self.decoder(x)
x = self.mel_out(x)
ret['kl'] = 0
return x * tgt_nonpadding
else:
decoder_inp = x
x = x.transpose(1, 2) # [B, H, T]
tgt_nonpadding_BHT = tgt_nonpadding.transpose(1, 2) # [B, H, T]
if infer:
z = self.fvae(cond=x, infer=True)
else:
tgt_mels = tgt_mels.transpose(1, 2) # [B, 80, T]
z, ret['kl'], ret['z_p'], ret['m_q'], ret['logs_q'] = self.fvae(
tgt_mels, tgt_nonpadding_BHT, cond=x)
if global_step < hparams['posterior_start_steps']:
z = torch.randn_like(z)
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
ret['pre_mel_out'] = x_recon
return x_recon
def forward_dur(self, dur_input, mel2word, ret, **kwargs):
"""
:param dur_input: [B, T_txt, H]
:param mel2word: [B, T_mel]
:param ret: dict collecting intermediate outputs
:return: mel2word, [B, T_mel]
"""
src_padding = dur_input.data.abs().sum(-1) == 0
dur_input = dur_input.detach() + hparams['predictor_grad'] * (dur_input - dur_input.detach())
dur = self.dur_predictor(dur_input, src_padding)
if hparams['dur_level'] == 'word':
word_len = kwargs['word_len']
ph2word = kwargs['ph2word']
B, T_ph = ph2word.shape
dur = torch.zeros([B, word_len.max() + 1]).to(ph2word.device).scatter_add(1, ph2word, dur)
dur = dur[:, 1:]
ret['dur'] = dur
if mel2word is None:
mel2word = self.length_regulator(dur).detach()
return mel2word
def get_pos_embed(self, word2word, x2word):
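# computes, for each phoneme/frame, its position within the word it belongs to (cumulative index normalized by word length), then embeds it sinusoidally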
x_pos = build_word_mask(word2word, x2word).float() # [B, T_word, T_ph]
x_pos = (x_pos.cumsum(-1) / x_pos.sum(-1).clamp(min=1)[..., None] * x_pos).sum(1)
x_pos = self.sin_pos(x_pos.float()) # [B, T_ph, H]
return x_pos
def store_inverse_all(self):
def remove_weight_norm(m):
try:
if hasattr(m, 'store_inverse'):
m.store_inverse()
nn.utils.remove_weight_norm(m)
except ValueError: # this module didn't have weight norm
return
self.apply(remove_weight_norm)
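One step in `forward_dur` above deserves a note: word-level durations are obtained by pooling per-phoneme durations with `scatter_add`. A self-contained illustration with hypothetical values:

```python
import torch

# ph2word maps each phoneme to its 1-indexed word; index 0 is reserved for padding.
ph2word = torch.tensor([[1, 1, 2, 2, 2, 3]])        # [B=1, T_ph=6]
ph_dur  = torch.tensor([[2., 3., 1., 1., 2., 4.]])  # predicted per-phoneme durations
B, n_words = ph2word.shape[0], int(ph2word.max())
word_dur = torch.zeros([B, n_words + 1]).scatter_add(1, ph2word, ph_dur)
print(word_dur[:, 1:])  # tensor([[5., 4., 4.]]), word-level durations with the padding bin dropped
```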

@@ -1,75 +0,0 @@
import torch
import torch.distributions as dist
from torch import nn
from modules.commons.normalizing_flow.glow_modules import Glow
from modules.portaspeech.portaspeech import PortaSpeech
from utils.hparams import hparams
class PortaSpeechFlow(PortaSpeech):
def __init__(self, ph_dict_size, word_dict_size, out_dims=None):
super().__init__(ph_dict_size, word_dict_size, out_dims)
cond_hs = 80
if hparams.get('use_txt_cond', True):
cond_hs = cond_hs + hparams['hidden_size']
if hparams.get('use_latent_cond', False):
cond_hs = cond_hs + hparams['latent_size']
if hparams['use_cond_proj']:
self.g_proj = nn.Conv1d(cond_hs, 160, 5, padding=2)
cond_hs = 160
self.post_flow = Glow(
80, hparams['post_glow_hidden'], hparams['post_glow_kernel_size'], 1,
hparams['post_glow_n_blocks'], hparams['post_glow_n_block_layers'],
n_split=4, n_sqz=2,
gin_channels=cond_hs,
share_cond_layers=hparams['post_share_cond_layers'],
share_wn_layers=hparams['share_wn_layers'],
sigmoid_scale=hparams['sigmoid_scale']
)
self.prior_dist = dist.Normal(0, 1)
def forward(self, txt_tokens, word_tokens, ph2word, word_len, mel2word=None, mel2ph=None,
spk_embed=None, spk_id=None, pitch=None, infer=False, tgt_mels=None,
forward_post_glow=True, two_stage=True, global_step=None, **kwargs):
is_training = self.training
train_fvae = not (forward_post_glow and two_stage)
if not train_fvae:
self.eval()
with torch.set_grad_enabled(mode=train_fvae):
ret = super(PortaSpeechFlow, self).forward(
txt_tokens, word_tokens, ph2word, word_len, mel2word, mel2ph,
spk_embed, spk_id, pitch, infer, tgt_mels, global_step, **kwargs)
if (forward_post_glow or not two_stage) and hparams['use_post_flow']:
self.run_post_glow(tgt_mels, infer, is_training, ret)
return ret
def run_post_glow(self, tgt_mels, infer, is_training, ret):
x_recon = ret['mel_out'].transpose(1, 2)
g = x_recon
B, _, T = g.shape
if hparams.get('use_txt_cond', True):
g = torch.cat([g, ret['decoder_inp'].transpose(1, 2)], 1)
if hparams.get('use_latent_cond', False):
g_z = ret['z_p'][:, :, :, None].repeat(1, 1, 1, 4).reshape(B, -1, T)
g = torch.cat([g, g_z], 1)
if hparams['use_cond_proj']:
g = self.g_proj(g)
prior_dist = self.prior_dist
if not infer:
if is_training:
self.post_flow.train()
nonpadding = ret['nonpadding'].transpose(1, 2)
y_lengths = nonpadding.sum(-1)
if hparams['detach_postflow_input']:
g = g.detach()
tgt_mels = tgt_mels.transpose(1, 2)
z_postflow, ldj = self.post_flow(tgt_mels, nonpadding, g=g)
ldj = ldj / y_lengths / 80
ret['z_pf'], ret['ldj_pf'] = z_postflow, ldj
ret['postflow'] = -prior_dist.log_prob(z_postflow).mean() - ldj.mean()
if torch.isnan(ret['postflow']):
ret['postflow'] = None
else:
nonpadding = torch.ones_like(x_recon[:, :1, :])
z_post = torch.randn(x_recon.shape).to(g.device) * hparams['noise_scale']
x_recon, _ = self.post_flow(z_post, nonpadding, g, reverse=True)
ret['mel_out'] = x_recon.transpose(1, 2)
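In `run_post_glow` above, the squeezed FVAE latent `z_p` is broadcast back to frame rate before being concatenated into the flow condition. A small shape illustration of that repeat/reshape (the factor 4 matches the hard-coded repeat above; all sizes are hypothetical):

```python
import torch

B, C_latent, T_frames = 2, 16, 128
z_p = torch.randn(B, C_latent, T_frames // 4)        # latent at 1/4 of the mel frame rate
g_z = z_p[:, :, :, None].repeat(1, 1, 1, 4).reshape(B, -1, T_frames)
# every latent frame is repeated 4 times along the time axis
assert g_z.shape == (B, C_latent, T_frames)
```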

@@ -2,23 +2,26 @@
**AudioGPT** connects ChatGPT with a series of audio foundation models to enable **sending** and **receiving** speech, singing voice, audio, and talking-head video during chatting.
<a href="https://huggingface.co/spaces/AIGC-Audio/AudioGPT">
<img src="https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-blue" alt="Open in Spaces">
</a>
## Capabilities
Up-to-date link: https://93868c7fa583f4b5.gradio.app
Up-to-date link: https://cdb7b543afd1c8e8.gradio.app
Here we list the capabilities of AudioGPT at this time. More supported models and tasks are coming soon. For prompt examples, refer to [asset](assets/README.md).
### Speech
| Task | Supported Foundation Models | Status |
|:-------------------------:|:-------------------------------:|:------:|
| Text-to-Speech | [FastSpeech](), [SyntaSpeech](), [VITS]() | Yes (WIP) |
| Style Transfer | [GenerSpeech]() | Yes |
| Speech Recognition | [whisper](), [Conformer]() | Yes |
| Speech Enhancement | [ConvTasNet]() | WIP |
| Speech Separation | [TF-GridNet]() | WIP |
| Speech Translation | [Multi-decoder]() | WIP |
| Mono-to-Binaural Speech | [NeuralWarp]() | Yes |
| Task | Supported Foundation Models | Status |
|:--------------------------:|:-------------------------------:|:------:|
| Text-to-Speech | [FastSpeech](), [SyntaSpeech](), [VITS]() | Yes (WIP) |
| Style Transfer | [GenerSpeech]() | Yes |
| Speech Recognition | [whisper](), [Conformer]() | Yes |
| Speech Enhancement | [ConvTasNet]() | WIP |
| Speech Separation | [TF-GridNet]() | WIP |
| Speech Translation | [Multi-decoder]() | WIP |
| Mono-to-Binaural | [NeuralWarp]() | Yes |
### Sing
@@ -27,14 +30,14 @@ Here we list the capability of AudioGPT at this time. More supported models and
| Text-to-Sing | [DiffSinger](), [VISinger]() | Yes (WIP) |
### Audio
| Task | Supported Foundation Models | Status |
|:----------------:|:---------------------------:|:---------:|
| Text-to-Audio | [Make-An-Audio]() | Yes |
| Audio Inpainting | [Make-An-Audio]() | Yes |
| Image-to-Audio | [Make-An-Audio]() | Yes |
| Sound Detection | [Audio-transformer]() | Yes (WIP) |
| Target sound detection | [TSDNet]() | Yes (WIP) |
| Sound Extraction | [LASSNet]() | Yes (WIP) |
| Task | Supported Foundation Models | Status |
|:----------------------:|:---------------------------:|:------:|
| Text-to-Audio | [Make-An-Audio]() | Yes |
| Audio Inpainting | [Make-An-Audio]() | Yes |
| Image-to-Audio | [Make-An-Audio]() | Yes |
| Sound Detection | [Audio-transformer]() | Yes |
| Target Sound Detection | [TSDNet]() | Yes |
| Sound Extraction | [LASSNet]() | Yes |
### Talking Head
@@ -44,7 +47,8 @@ Here we list the capability of AudioGPT at this time. More supported models and
| Talking Head Synthesis | [GeneFace]() | Yes (WIP) |
## Internal Version Updates
4.3 Support Talking Head Synthesis\
4.6 Support Sound Extraction/Detection\
4.3 Support huggingface demo space\
4.1 Support Audio inpainting and clean codes\
3.27 Support Style Transfer/Talking head Synthesis\
3.23 Support Text-to-Sing\
@@ -54,10 +58,9 @@ Here we list the capability of AudioGPT at this time. More supported models and
## Todo
- [x] clean text to sing/speech code
- [ ] import Espnet models for speech tasks
- [ ] merge talking head synthesis into main
- [x] change audio/video log output
- [ ] support huggingface space
- [x] support huggingface space
## Acknowledgement
We appreciate the open source of the following projects:

@@ -66,13 +66,8 @@ Input Example : Please tell me the text description of this audio.<br />
Output:<br />
![](a2i.png)<br />
## Image
### Text-To-Image
Input Example : Generate an image of a horse<br />
Output:<br />
![](t2i.png)<br />
## Sound Detection
### Sound Detection
First upload your audio (.wav)<br />
Audio Example :<br />
<audio src="mix.wav" controls></audio><br />
@@ -80,21 +75,21 @@ Input Example : What events does this audio include?<br />
Output:<br />
![](detection.png)<br />
## Mono audio to Binaural Audio
### Mono audio to Binaural Audio
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: Transfer the mono speech to a binaural one.<br />
Output:<br />
![](m2b.png)<br />
## Target Sound Detection
### Target Sound Detection
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: please help me detect the target sound in the audio based on the description: “I want to detect Applause event”<br />
Output:<br />
![](tsd.png)<br />
## Sound Extraction
### Sound Extraction
First upload your audio (.wav)<br />
<audio src="mix.wav" controls></audio><br />
Input Example: Please help me extract the sound events from the audio based on the description: "a person shouts nearby and then emergency vehicle sirens sounds"<br />

@@ -51,16 +51,16 @@ from target_sound_detection.src.models import event_labels
from target_sound_detection.src.utils import median_filter, decode_with_timestamps
import clip
import numpy as np
AUDIO_CHATGPT_PREFIX = """Audio ChatGPT
AUdio ChatGPT can not directly read audios, but it has a list of tools to finish different audio synthesis tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, Audio ChatGPT is very strict to the file name and will never fabricate nonexistent files.
AUdio ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide Audio ChatGPT with a description. Audio ChatGPT should generate audios according to this description rather than directly imagine from memory or yourself."
AUDIO_CHATGPT_PREFIX = """AudioGPT
AudioGPT can not directly read audios, but it has a list of tools to finish different speech, audio, and singing voice tasks. Each audio will have a file name formed as "audio/xxx.wav". When talking about audios, AudioGPT is very strict to the file name and will never fabricate nonexistent files.
AudioGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the audio content and audio file name. It will remember to provide the file name from the last tool observation, if a new audio is generated.
Human may provide new audios to AudioGPT with a description. The description helps AudioGPT to understand this audio, but AudioGPT should use tools to finish following tasks, rather than directly imagine from the description.
Overall, AudioGPT is a powerful audio dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
TOOLS:
------
Audio ChatGPT has access to the following tools:"""
AudioGPT has access to the following tools:"""
AUDIO_CHATGPT_FORMAT_INSTRUCTIONS = """To use a tool, please use the following format:
@@ -238,7 +238,7 @@ class I2A:
image = Image.open(image)
image = self.sampler.model.cond_stage_model.preprocess(image).unsqueeze(0)
image_embedding = self.sampler.model.cond_stage_model.forward_img(image)
c = image_embedding.repeat(n_samples, 1, 1)  # shape: [1, 77, 1280]; still per-token embeddings, not yet pooled into a sentence embedding
c = image_embedding.repeat(n_samples, 1, 1)
shape = [self.sampler.model.first_stage_model.embed_dim, H//8, W//8] # (z_dim, 80//2^x, 848//2^x)
samples_ddim, _ = self.sampler.sample(S=ddim_steps,
conditioning=c,
@@ -396,9 +396,9 @@ class Inpaint:
sr, ori_wav = wavfile.read(input_audio_path)
print("gen_mel")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just enforces C-contiguous memory layout; safe to ignore
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
if len(ori_wav.shape)==2:# stereo
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
ori_wav = librosa.to_mono(ori_wav.T)
print(sr,ori_wav.shape,ori_wav)
ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
@@ -417,9 +417,9 @@ class Inpaint:
print("gen_mel_audio")
print(sr,ori_wav.shape,ori_wav)
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0  # order='C' just enforces C-contiguous memory layout; safe to ignore
ori_wav = ori_wav.astype(np.float32, order='C') / 32768.0
if len(ori_wav.shape)==2:# stereo
ori_wav = librosa.to_mono(ori_wav.T)# gradio load wav shape could be (wav_len,2) but librosa expects (2,wav_len)
ori_wav = librosa.to_mono(ori_wav.T)
print(sr,ori_wav.shape,ori_wav)
ori_wav = librosa.resample(ori_wav,orig_sr = sr,target_sr = SAMPLE_RATE)
@@ -432,7 +432,7 @@ class Inpaint:
mel = TRANSFORMS_16000(input_wav)
return mel
def show_mel_fn(self, input_audio_path):
crop_len = 500  # the full mel cannot be shown due to a gradio Image bug when using tool='sketch'
crop_len = 500
crop_mel = self.gen_mel(input_audio_path)[:,:crop_len]
color_mel = self.cmap_transform(crop_mel)
image = Image.fromarray((color_mel*255).astype(np.uint8))
@@ -473,11 +473,11 @@ class Inpaint:
torch.set_grad_enabled(False)
mel_img = Image.open(mel_and_mask['image'])
mask_img = Image.open(mel_and_mask["mask"])
show_mel = np.array(mel_img.convert("L"))/255  # the displayed mel is only a crop, so the full mel must be regenerated from the audio
show_mel = np.array(mel_img.convert("L"))/255
mask = np.array(mask_img.convert("L"))/255
mel_bins,mel_len = 80,848
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]  # regenerate the mel from the audio since only a crop was displayed
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)  # pad the mask to the full mel length
input_mel = self.gen_mel_audio(input_audio)[:,:mel_len]
mask = np.pad(mask,((0,0),(0,mel_len-mask.shape[1])),mode='constant',constant_values=0)
print(mask.shape,input_mel.shape)
with torch.no_grad():
batch = self.make_batch_sd(input_mel,mask,num_samples=1)
@@ -781,7 +781,7 @@ class TargetSoundDetection:
class ConversationBot:
def __init__(self):
print("Initializing AudioChatGPT")
print("Initializing AudioGPT")
self.llm = OpenAI(temperature=0)
self.t2i = T2I(device="cuda:0")
self.i2t = ImageCaptioning(device="cuda:1")
@@ -820,19 +820,19 @@ class ConversationBot:
"Or Like: Generate a piece of singing voice. Text is xxx, note is xxx, duration is xxx."
"The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
description="useful for when you want to convert a user input text into speech audio it saved it to a file."
description="useful for when you want to convert a user input text into speech and saved it to a file."
"The input to this tool should be a string, representing the text used to be converted to speech."),
Tool(name="Generate Audio From The Image", func=self.i2a.inference,
description="useful for when you want to generate an audio based on an image."
"The input to this tool should be a string, representing the image_path. "),
"The input to this tool should be a string, representing the image path. "),
Tool(name="Generate Text From The Audio", func=self.a2t.inference,
description="useful for when you want to describe an audio in text, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path."),
description="useful for when you want to generate description of an audio or know what is inside the audio."
"The input to this tool should be a string, representing the audio path."),
Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
"The input to this tool should be a string, representing the audio_path."),
description="useful for when you want to inpaint or manipulate an audio, this tool receives audio path as input, "
"The input to this tool should be a string, representing the audio path."),
Tool(name="Transcribe speech", func=self.asr.inference,
description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
description="useful for when you want to know the content and transcription corresponding to a human speech, receives audio_path as input."
"The input to this tool should be a string, representing the audio_path."),
Tool(name="Detect the sound event from the audio", func=self.detection.inference,
description="useful for when you want to know what event in the audio and the sound event start or end time, receives audio_path as input. "
@@ -959,8 +959,8 @@ if __name__ == '__main__':
bot = ConversationBot()
with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
with gr.Row():
gr.Markdown("## Audio ChatGPT")
chatbot = gr.Chatbot(elem_id="chatbot", label="Audio ChatGPT")
gr.Markdown("## AudioGPT")
chatbot = gr.Chatbot(elem_id="chatbot", label="AudioGPT")
state = gr.State([])
with gr.Row():
with gr.Column(scale=0.7):

@@ -32,7 +32,7 @@ wget -P text_to_speech/checkpoints/ljspeech/ps_adv_baseline -i https://huggingfa
wget -P audio_to_text/audiocaps_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/audiocaps_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/clotho_cntrstv_cnn14rnn_trm -i https://huggingface.co/AIGC-Audio/AudioGPT/blob/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/config.yaml https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/clotho_cntrstv_cnn14rnn_trm/swa.pth
wget -P audio_to_text/pretrained_feature_extractors https://huggingface.co/AIGC-Audio/AudioGPT/resolve/main/audio_to_text/pretrained_feature_extractors/contrastive_pretrain_cnn14_bertm.pth
# audio detection
# Audio detection
cd audio_detection/audio_infer/useful_ckpts
wget https://huggingface.co/Dongchao/pre_trained_model/resolve/main/audio_detection.pth
cd mono2binaural/useful_ckpts