mirror of
https://github.com/AIGC-Audio/AudioGPT.git
synced 2026-05-18 05:04:58 +02:00
update
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
task_cls: usr.task.DiffFsTask
|
||||
task_cls: tasks.svs.task.DiffFsTask
|
||||
pitch_type: frame
|
||||
timesteps: 100
|
||||
dilation_cycle_length: 1
|
||||
@@ -23,7 +23,7 @@ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.70
|
||||
0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566,
|
||||
0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ]
|
||||
|
||||
task_cls: usr.diffspeech_task.DiffSpeechTask
|
||||
task_cls: tasks.svs.diffspeech_task.DiffSpeechTask
|
||||
vocoder: vocoders.hifigan.HifiGAN
|
||||
vocoder_ckpt: checkpoints/0414_hifi_lj_1
|
||||
num_valid_plots: 10
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- configs/singing/fs2.yaml
|
||||
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
|
||||
audio_sample_rate: 24000
|
||||
hop_size: 128 # Hop size.
|
||||
@@ -42,8 +42,8 @@ test_prefixes: [
|
||||
'2100',
|
||||
]
|
||||
|
||||
task_cls: usr.diffsinger_task.AuxDecoderMIDITask
|
||||
#vocoder: usr.singingvocoder.highgan.HighGAN
|
||||
task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask
|
||||
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
|
||||
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
|
||||
vocoder: vocoders.hifigan.HifiGAN
|
||||
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- usr/configs/popcs_ds_beta6.yaml
|
||||
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
- egs/egs_bases/svs/popcs_ds_beta6.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
|
||||
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
|
||||
binary_data_dir: 'data/binary/opencpop-midi-dp'
|
||||
@@ -21,7 +21,7 @@ pe_ckpt: ''
|
||||
|
||||
fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' #
|
||||
#num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerMIDITask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
|
||||
|
||||
K_step: 60
|
||||
max_tokens: 36000
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- usr/configs/popcs_ds_beta6.yaml
|
||||
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
- egs/egs_bases/svs/popcs_ds_beta6.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
|
||||
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
|
||||
binary_data_dir: 'data/binary/opencpop-midi-dp'
|
||||
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
|
||||
|
||||
fs2_ckpt: '' #
|
||||
#num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerMIDITask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
|
||||
|
||||
timesteps: 1000
|
||||
K_step: 1000
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- usr/configs/popcs_ds_beta6.yaml
|
||||
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
- egs/egs_bases/svs/popcs_ds_beta6.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
|
||||
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
|
||||
binary_data_dir: 'data/binary/opencpop-midi-dp'
|
||||
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
|
||||
|
||||
fs2_ckpt: '' #
|
||||
#num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerMIDITask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
|
||||
|
||||
# for diffusion schedule
|
||||
timesteps: 1000
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- usr/configs/popcs_ds_beta6.yaml
|
||||
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
- egs/egs_bases/svs/popcs_ds_beta6.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
|
||||
|
||||
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
|
||||
binary_data_dir: 'data/binary/opencpop-midi-dp'
|
||||
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
|
||||
|
||||
fs2_ckpt: '' #
|
||||
#num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerMIDITask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
|
||||
|
||||
K_step: 100
|
||||
max_tokens: 36000
|
||||
@@ -1,6 +1,6 @@
|
||||
base_config:
|
||||
- usr/configs/popcs_ds_beta6.yaml
|
||||
- usr/configs/midi/cascade/popcs/popcs_statis.yaml
|
||||
- egs/egs_bases/svs/popcs_ds_beta6.yaml
|
||||
- egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml
|
||||
|
||||
binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer
|
||||
binary_data_dir: 'data/binary/popcs-midi-dp'
|
||||
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
|
||||
|
||||
fs2_ckpt: '' #
|
||||
#num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerMIDITask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
|
||||
|
||||
K_step: 100
|
||||
max_tokens: 40000
|
||||
@@ -48,8 +48,8 @@ spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.121
|
||||
-0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035,
|
||||
-0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766]
|
||||
|
||||
task_cls: usr.diffsinger_task.DiffSingerTask
|
||||
#vocoder: usr.singingvocoder.highgan.HighGAN
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerTask
|
||||
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
|
||||
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
|
||||
vocoder: vocoders.hifigan.HifiGAN
|
||||
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
|
||||
@@ -3,7 +3,7 @@ base_config:
|
||||
|
||||
fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer
|
||||
num_valid_plots: 0
|
||||
task_cls: usr.diffsinger_task.DiffSingerOfflineTask
|
||||
task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask
|
||||
|
||||
# tmp:
|
||||
#pe_enable: true
|
||||
@@ -26,7 +26,7 @@ test_prefixes: [
|
||||
]
|
||||
|
||||
task_cls: tasks.tts.fs2.FastSpeech2Task
|
||||
#vocoder: usr.singingvocoder.highgan.HighGAN
|
||||
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
|
||||
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
|
||||
vocoder: vocoders.hifigan.HifiGAN
|
||||
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
|
||||
@@ -2,8 +2,8 @@ import torch
|
||||
from inference.svs.base_svs_infer import BaseSVSInfer
|
||||
from utils import load_ckpt
|
||||
from utils.hparams import hparams
|
||||
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from usr.diffsinger_task import DIFF_DECODERS
|
||||
from modulesmodules.diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from tasks.svs.diffsinger_task import DIFF_DECODERS
|
||||
|
||||
class DiffSingerCascadeInfer(BaseSVSInfer):
|
||||
def build_model(self):
|
||||
@@ -51,4 +51,4 @@ if __name__ == '__main__':
|
||||
} # input like Opencpop dataset.
|
||||
DiffSingerCascadeInfer.example_run(inp)
|
||||
|
||||
# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
|
||||
# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
|
||||
@@ -4,8 +4,8 @@ import torch
|
||||
from inference.svs.base_svs_infer import BaseSVSInfer
|
||||
from utils import load_ckpt
|
||||
from utils.hparams import hparams
|
||||
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from usr.diffsinger_task import DIFF_DECODERS
|
||||
from modulesmodules.diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from tasks.svs.diffsinger_task import DIFF_DECODERS
|
||||
from modules.fastspeech.pe import PitchExtractor
|
||||
import utils
|
||||
|
||||
@@ -64,4 +64,4 @@ if __name__ == '__main__':
|
||||
DiffSingerE2EInfer.example_run(inp)
|
||||
|
||||
|
||||
# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
|
||||
# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
|
||||
@@ -1,27 +0,0 @@
|
||||
title: 'DiffSinger'
|
||||
description: |
|
||||
This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。)
|
||||
|
||||
Please assign pitch and duration values to each Chinese character. The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。)
|
||||
|
||||
You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。)
|
||||
|
||||
Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。)
|
||||
|
||||
article: |
|
||||
Link to <a href='https://github.com/MoonInTheRiver/DiffSinger' style='color:blue;' target='_blank\'>Github REPO</a>
|
||||
example_inputs:
|
||||
- |-
|
||||
你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP<sep>D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest<sep>0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590
|
||||
- |-
|
||||
小酒窝长睫毛AP是你最美的记号<sep>C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4<sep>0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340
|
||||
- |-
|
||||
我真的SP爱你SP句句不轻易<sep>D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A4<sep>0.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9
|
||||
- |-
|
||||
好冷啊 AP 我在东北玩泥巴<sep>F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D4<sep>0.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4
|
||||
|
||||
#inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer
|
||||
#exp_name: 0303_opencpop_ds58_midi
|
||||
|
||||
inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer
|
||||
exp_name: 0831_opencpop_ds1000
|
||||
@@ -1,91 +0,0 @@
|
||||
import importlib
|
||||
import re
|
||||
|
||||
import gradio as gr
|
||||
import yaml
|
||||
from gradio.inputs import Textbox
|
||||
|
||||
from inference.svs.base_svs_infer import BaseSVSInfer
|
||||
from utils.hparams import set_hparams
|
||||
from utils.hparams import hparams as hp
|
||||
import numpy as np
|
||||
|
||||
|
||||
class GradioInfer:
|
||||
def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
|
||||
self.exp_name = exp_name
|
||||
self.title = title
|
||||
self.description = description
|
||||
self.article = article
|
||||
self.example_inputs = example_inputs
|
||||
pkg = ".".join(inference_cls.split(".")[:-1])
|
||||
cls_name = inference_cls.split(".")[-1]
|
||||
self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
|
||||
|
||||
def greet(self, text, notes, notes_duration):
|
||||
PUNCS = '。?;:'
|
||||
sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
|
||||
sents_notes = re.split(rf'([{PUNCS}])', notes.replace('\n', ','))
|
||||
sents_notes_dur = re.split(rf'([{PUNCS}])', notes_duration.replace('\n', ','))
|
||||
|
||||
if sents[-1] not in list(PUNCS):
|
||||
sents = sents + ['']
|
||||
sents_notes = sents_notes + ['']
|
||||
sents_notes_dur = sents_notes_dur + ['']
|
||||
|
||||
audio_outs = []
|
||||
s, n, n_dur = "", "", ""
|
||||
for i in range(0, len(sents), 2):
|
||||
if len(sents[i]) > 0:
|
||||
s += sents[i] + sents[i + 1]
|
||||
n += sents_notes[i] + sents_notes[i+1]
|
||||
n_dur += sents_notes_dur[i] + sents_notes_dur[i+1]
|
||||
if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
|
||||
audio_out = self.infer_ins.infer_once({
|
||||
'text': s,
|
||||
'notes': n,
|
||||
'notes_duration': n_dur,
|
||||
})
|
||||
audio_out = audio_out * 32767
|
||||
audio_out = audio_out.astype(np.int16)
|
||||
audio_outs.append(audio_out)
|
||||
audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
|
||||
s = ""
|
||||
n = ""
|
||||
audio_outs = np.concatenate(audio_outs)
|
||||
return hp['audio_sample_rate'], audio_outs
|
||||
|
||||
def run(self):
|
||||
set_hparams(exp_name=self.exp_name, print_hparams=False)
|
||||
infer_cls = self.inference_cls
|
||||
self.infer_ins: BaseSVSInfer = infer_cls(hp)
|
||||
example_inputs = self.example_inputs
|
||||
for i in range(len(example_inputs)):
|
||||
text, notes, notes_dur = example_inputs[i].split('<sep>')
|
||||
example_inputs[i] = [text, notes, notes_dur]
|
||||
|
||||
iface = gr.Interface(fn=self.greet,
|
||||
inputs=[
|
||||
Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"),
|
||||
Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"),
|
||||
Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration")]
|
||||
,
|
||||
outputs="audio",
|
||||
allow_flagging="never",
|
||||
title=self.title,
|
||||
description=self.description,
|
||||
article=self.article,
|
||||
examples=example_inputs,
|
||||
enable_queue=True)
|
||||
iface.launch(share=True,)# cache_examples=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
gradio_config = yaml.safe_load(open('inference/svs/gradio/gradio_settings.yaml'))
|
||||
g = GradioInfer(**gradio_config)
|
||||
g.run()
|
||||
|
||||
|
||||
# python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
|
||||
# python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
|
||||
# CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
|
||||
@@ -1,12 +1,15 @@
|
||||
import torch
|
||||
import os
|
||||
import importlib
|
||||
from inference.tts.base_tts_infer import BaseTTSInfer
|
||||
from utils.ckpt_utils import load_ckpt, get_last_checkpoint
|
||||
from modules.GenerSpeech.model.generspeech import GenerSpeech
|
||||
import os
|
||||
from data_gen.tts.emotion import inference as EmotionEncoder
|
||||
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
|
||||
from data_gen.tts.emotion.inference import preprocess_wav
|
||||
|
||||
from data_gen.tts.data_gen_utils import is_sil_phoneme
|
||||
from resemblyzer import VoiceEncoder
|
||||
from utils import audio
|
||||
class GenerSpeechInfer(BaseTTSInfer):
|
||||
def build_model(self):
|
||||
model = GenerSpeech(self.ph_encoder)
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
from data_gen.tts.data_gen_utils import is_sil_phoneme
|
||||
from resemblyzer import VoiceEncoder
|
||||
from data_gen.tts.data_gen_utils import build_phone_encoder, build_word_encoder
|
||||
from tasks.tts.dataset_utils import FastSpeechWordDataset
|
||||
from tasks.tts.tts_utils import load_data_preprocessor
|
||||
from vocoders.hifigan import HifiGanGenerator
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
title: 'Rongjiehuang/GenerSpeech'
|
||||
description: |
|
||||
Gradio demo for Rongjiehuang/GenerSpeech. To use it, simply add your audio, or click one of the examples to load them.
|
||||
article: |
|
||||
Link to <a href='https://github.com/Rongjiehuang/GenerSpeech' style='color:blue;' target='_blank\'>Github REPO</a>
|
||||
example_inputs:
|
||||
- |-
|
||||
the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
|
||||
- |-
|
||||
produced the block books, which were the immediate predecessors of the true printed book,
|
||||
inference_cls: inference.GenerSpeech.GenerSpeechInfer
|
||||
exp_name: GenerSpeech
|
||||
config: modules/GenerSpeech/config/prodiff_teacher.yaml
|
||||
@@ -1,72 +0,0 @@
|
||||
import importlib
|
||||
import re
|
||||
|
||||
import gradio as gr
|
||||
import yaml
|
||||
from gradio.inputs import Textbox, Audio
|
||||
|
||||
from inference.base_tts_infer import BaseTTSInfer
|
||||
from utils.hparams import set_hparams
|
||||
from utils.hparams import hparams as hp
|
||||
import numpy as np
|
||||
|
||||
from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
|
||||
|
||||
class GradioInfer:
|
||||
def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs):
|
||||
self.exp_name = exp_name
|
||||
self.config = config
|
||||
self.title = title
|
||||
self.description = description
|
||||
self.article = article
|
||||
self.example_inputs = example_inputs
|
||||
pkg = ".".join(inference_cls.split(".")[:-1])
|
||||
cls_name = inference_cls.split(".")[-1]
|
||||
self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
|
||||
|
||||
def greet(self, text, audio):
|
||||
sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
|
||||
if sents[-1] not in list(PUNCS):
|
||||
sents = sents + ['.']
|
||||
audio_outs = []
|
||||
s = ""
|
||||
for i in range(0, len(sents), 2):
|
||||
if len(sents[i]) > 0:
|
||||
s += sents[i] + sents[i + 1]
|
||||
if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
|
||||
audio_out = self.infer_ins.infer_once({
|
||||
'text': s,
|
||||
'ref_audio': audio
|
||||
})
|
||||
audio_out = audio_out * 32767
|
||||
audio_out = audio_out.astype(np.int16)
|
||||
audio_outs.append(audio_out)
|
||||
audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
|
||||
s = ""
|
||||
audio_outs = np.concatenate(audio_outs)
|
||||
return hp['audio_sample_rate'], audio_outs
|
||||
|
||||
def run(self):
|
||||
set_hparams(exp_name=self.exp_name, config=self.config)
|
||||
infer_cls = self.inference_cls
|
||||
self.infer_ins: BaseTTSInfer = infer_cls(hp)
|
||||
example_inputs = self.example_inputs
|
||||
iface = gr.Interface(fn=self.greet,
|
||||
inputs=[
|
||||
Textbox(lines=10, placeholder=None, default=example_inputs[0], label="input text"),
|
||||
Audio(label="reference audio"),
|
||||
],
|
||||
outputs="audio",
|
||||
allow_flagging="never",
|
||||
title=self.title,
|
||||
description=self.description,
|
||||
article=self.article,
|
||||
examples=example_inputs,
|
||||
enable_queue=True)
|
||||
iface.launch()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
gradio_config = yaml.safe_load(open('inference/gradio/gradio_settings.yaml'))
|
||||
g = GradioInfer(**gradio_config)
|
||||
g.run()
|
||||
@@ -76,26 +76,6 @@ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False)
|
||||
pass
|
||||
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
|
||||
|
||||
class LayerNorm_(torch.nn.LayerNorm):
|
||||
"""Layer normalization module.
|
||||
:param int nout: output dim size
|
||||
:param int dim: dimension to be normalized
|
||||
"""
|
||||
|
||||
def __init__(self, nout, dim=-1, eps=1e-5):
|
||||
"""Construct an LayerNorm object."""
|
||||
super(LayerNorm_, self).__init__(nout, eps=eps)
|
||||
self.dim = dim
|
||||
|
||||
def forward(self, x):
|
||||
"""Apply layer normalization.
|
||||
:param torch.Tensor x: input tensor
|
||||
:return: layer normalized tensor
|
||||
:rtype torch.Tensor
|
||||
"""
|
||||
if self.dim == -1:
|
||||
return super(LayerNorm_, self).forward(x)
|
||||
return super(LayerNorm_, self).forward(x.transpose(1, -1)).transpose(1, -1)
|
||||
def Linear(in_features, out_features, bias=True):
|
||||
m = nn.Linear(in_features, out_features, bias)
|
||||
nn.init.xavier_uniform_(m.weight)
|
||||
|
||||
@@ -3,7 +3,8 @@ import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modules.commons.common_layers import LayerNorm_, Embedding
|
||||
from modules.commons.common_layers import Embedding
|
||||
from modules.fastspeech.tts_modules import LayerNorm
|
||||
|
||||
|
||||
class LambdaLayer(nn.Module):
|
||||
@@ -35,7 +36,7 @@ class ResidualBlock(nn.Module):
|
||||
elif norm_type == 'gn':
|
||||
norm_builder = lambda: nn.GroupNorm(8, channels)
|
||||
elif norm_type == 'ln':
|
||||
norm_builder = lambda: LayerNorm_(channels, eps=ln_eps)
|
||||
norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps)
|
||||
else:
|
||||
norm_builder = lambda: nn.Identity()
|
||||
|
||||
@@ -89,7 +90,7 @@ class ConvBlocks(nn.Module):
|
||||
elif norm_type == 'gn':
|
||||
norm = nn.GroupNorm(8, hidden_size)
|
||||
elif norm_type == 'ln':
|
||||
norm = LayerNorm_(hidden_size, eps=ln_eps)
|
||||
norm = LayerNorm(hidden_size, dim=1, eps=ln_eps)
|
||||
self.last_norm = norm
|
||||
self.post_net1 = nn.Conv1d(hidden_size, out_dims, kernel_size=post_net_kernel,
|
||||
padding=post_net_kernel // 2)
|
||||
|
||||
@@ -3,8 +3,8 @@ import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
from modules.commons.conv import TextConvEncoder, ConvBlocks
|
||||
from modules.commons.common_layers import Embedding, LayerNorm_
|
||||
from modules.fastspeech.tts_modules import PitchPredictor, LengthRegulator
|
||||
from modules.commons.common_layers import Embedding
|
||||
from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
|
||||
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
|
||||
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
|
||||
from utils.pitch_utils import denorm_f0, f0_to_coarse
|
||||
@@ -34,7 +34,7 @@ class DurationPredictor(torch.nn.Module):
|
||||
self.conv += [torch.nn.Sequential(
|
||||
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
|
||||
torch.nn.ReLU(),
|
||||
LayerNorm_(n_chans, dim=1),
|
||||
LayerNorm(n_chans, dim=1),
|
||||
torch.nn.Dropout(dropout_rate)
|
||||
)]
|
||||
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
|
||||
|
||||
@@ -2,16 +2,16 @@ import torch
|
||||
|
||||
import utils
|
||||
from utils.hparams import hparams
|
||||
from .diff.net import DiffNet
|
||||
from .diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
|
||||
from .diffspeech_task import DiffSpeechTask
|
||||
from modules.diff.net import DiffNet
|
||||
from modules.diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
|
||||
from tasks.svs.diffspeech_task import DiffSpeechTask
|
||||
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
|
||||
from modules.fastspeech.pe import PitchExtractor
|
||||
from modules.fastspeech.fs2 import FastSpeech2
|
||||
from modules.diffsinger_midi.fs2 import FastSpeech2MIDI
|
||||
from modules.fastspeech.tts_modules import mel2ph_to_dur
|
||||
|
||||
from usr.diff.candidate_decoder import FFT
|
||||
from modules.diff.candidate_decoder import FFT
|
||||
from utils.pitch_utils import denorm_f0
|
||||
from tasks.tts.fs2_utils import FastSpeechDataset
|
||||
from tasks.tts.fs2 import FastSpeech2Task
|
||||
@@ -2,9 +2,9 @@ import torch
|
||||
|
||||
import utils
|
||||
from utils.hparams import hparams
|
||||
from .diff.net import DiffNet
|
||||
from .diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from .task import DiffFsTask
|
||||
from modules.diff.net import DiffNet
|
||||
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
|
||||
from tasks.svs.task import DiffFsTask
|
||||
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
|
||||
from utils.pitch_utils import denorm_f0
|
||||
from tasks.tts.fs2_utils import FastSpeechDataset
|
||||
@@ -1,8 +1,8 @@
|
||||
import torch
|
||||
|
||||
import utils
|
||||
from .diff.diffusion import GaussianDiffusion
|
||||
from .diff.net import DiffNet
|
||||
from modules.diff.diffusion import GaussianDiffusion
|
||||
from modules.diff.net import DiffNet
|
||||
from tasks.tts.fs2 import FastSpeech2Task
|
||||
from utils.hparams import hparams
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use('Agg')
|
||||
|
||||
from utils import audio
|
||||
import matplotlib.pyplot as plt
|
||||
from data_gen.tts.data_gen_utils import get_pitch
|
||||
|
||||
@@ -38,7 +38,7 @@ from audio_to_text.inference_waveform import AudioCapModel
|
||||
import whisper
|
||||
from inference.svs.ds_e2e import DiffSingerE2EInfer
|
||||
from inference.tts.GenerSpeech import GenerSpeechInfer
|
||||
from inference.tts.SyntaSpeech import TTSInference
|
||||
from inference.tts.PortaSpeech import TTSInference
|
||||
from utils.hparams import set_hparams
|
||||
from utils.hparams import hparams as hp
|
||||
import scipy.io.wavfile as wavfile
|
||||
@@ -282,7 +282,7 @@ class T2S:
|
||||
print("Initializing DiffSinger to %s" % device)
|
||||
self.device = device
|
||||
self.exp_name = 'checkpoints/0831_opencpop_ds1000'
|
||||
self.config= 'NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml'
|
||||
self.config= 'NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml'
|
||||
self.set_model_hparams()
|
||||
self.pipe = DiffSingerE2EInfer(self.hp, device)
|
||||
self.default_inp = {
|
||||
|
||||
Reference in New Issue
Block a user