diff --git a/NeuralSeq/usr/configs/base.yaml b/NeuralSeq/egs/egs_bases/svs/base.yaml similarity index 92% rename from NeuralSeq/usr/configs/base.yaml rename to NeuralSeq/egs/egs_bases/svs/base.yaml index f5c13e6..ec30695 100644 --- a/NeuralSeq/usr/configs/base.yaml +++ b/NeuralSeq/egs/egs_bases/svs/base.yaml @@ -1,4 +1,4 @@ -task_cls: usr.task.DiffFsTask +task_cls: tasks.svs.task.DiffFsTask pitch_type: frame timesteps: 100 dilation_cycle_length: 1 diff --git a/NeuralSeq/usr/configs/lj_ds_beta6.yaml b/NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml similarity index 97% rename from NeuralSeq/usr/configs/lj_ds_beta6.yaml rename to NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml index a0fcebb..7fd443c 100644 --- a/NeuralSeq/usr/configs/lj_ds_beta6.yaml +++ b/NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml @@ -23,7 +23,7 @@ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.70 0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566, 0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ] -task_cls: usr.diffspeech_task.DiffSpeechTask +task_cls: tasks.svs.diffspeech_task.DiffSpeechTask vocoder: vocoders.hifigan.HifiGAN vocoder_ckpt: checkpoints/0414_hifi_lj_1 num_valid_plots: 10 diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml similarity index 87% rename from NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml index e8267c0..5c204cb 100644 --- a/NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml @@ -1,6 +1,6 @@ base_config: - configs/singing/fs2.yaml - - usr/configs/midi/cascade/opencs/opencpop_statis.yaml + - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml audio_sample_rate: 24000 hop_size: 128 # Hop size. 
@@ -42,8 +42,8 @@ test_prefixes: [ '2100', ] -task_cls: usr.diffsinger_task.AuxDecoderMIDITask -#vocoder: usr.singingvocoder.highgan.HighGAN +task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask +#vocoder: tasks.svs.singingvocoder.highgan.HighGAN #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl vocoder: vocoders.hifigan.HifiGAN vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml similarity index 81% rename from NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml index 7f1b6bd..820a608 100644 --- a/NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml @@ -1,6 +1,6 @@ base_config: - - usr/configs/popcs_ds_beta6.yaml - - usr/configs/midi/cascade/opencs/opencpop_statis.yaml + - egs/egs_bases/svs/popcs_ds_beta6.yaml + - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer binary_data_dir: 'data/binary/opencpop-midi-dp' @@ -21,7 +21,7 @@ pe_ckpt: '' fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' # #num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerMIDITask +task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask K_step: 60 max_tokens: 36000 diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/opencpop_statis.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml similarity index 100% rename from NeuralSeq/usr/configs/midi/cascade/opencs/opencpop_statis.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml similarity index 81% rename from 
NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml index 4443ca7..8825430 100644 --- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml @@ -1,6 +1,6 @@ base_config: - - usr/configs/popcs_ds_beta6.yaml - - usr/configs/midi/cascade/opencs/opencpop_statis.yaml + - egs/egs_bases/svs/popcs_ds_beta6.yaml + - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer binary_data_dir: 'data/binary/opencpop-midi-dp' @@ -17,7 +17,7 @@ dur_predictor_layers: 5 # * fs2_ckpt: '' # #num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerMIDITask +task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask timesteps: 1000 K_step: 1000 diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml similarity index 81% rename from NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml index 1feb508..3c942cd 100644 --- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml @@ -1,6 +1,6 @@ base_config: - - usr/configs/popcs_ds_beta6.yaml - - usr/configs/midi/cascade/opencs/opencpop_statis.yaml + - egs/egs_bases/svs/popcs_ds_beta6.yaml + - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer binary_data_dir: 'data/binary/opencpop-midi-dp' @@ -17,7 +17,7 @@ dur_predictor_layers: 5 # * fs2_ckpt: '' # #num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerMIDITask +task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask # for diffusion schedule timesteps: 1000 diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml similarity index 80% 
rename from NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml index 5ff1b00..3cdda09 100644 --- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml @@ -1,6 +1,6 @@ base_config: - - usr/configs/popcs_ds_beta6.yaml - - usr/configs/midi/cascade/opencs/opencpop_statis.yaml + - egs/egs_bases/svs/popcs_ds_beta6.yaml + - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer binary_data_dir: 'data/binary/opencpop-midi-dp' @@ -17,7 +17,7 @@ dur_predictor_layers: 5 # * fs2_ckpt: '' # #num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerMIDITask +task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask K_step: 100 max_tokens: 36000 diff --git a/NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml similarity index 80% rename from NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml index 7380923..b51b4f2 100644 --- a/NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml +++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml @@ -1,6 +1,6 @@ base_config: - - usr/configs/popcs_ds_beta6.yaml - - usr/configs/midi/cascade/popcs/popcs_statis.yaml + - egs/egs_bases/svs/popcs_ds_beta6.yaml + - egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer binary_data_dir: 'data/binary/popcs-midi-dp' @@ -17,7 +17,7 @@ dur_predictor_layers: 5 # * fs2_ckpt: '' # #num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerMIDITask +task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask K_step: 100 max_tokens: 40000 diff --git a/NeuralSeq/usr/configs/midi/pe.yaml b/NeuralSeq/egs/egs_bases/svs/midi/pe.yaml similarity index 100% rename from 
NeuralSeq/usr/configs/midi/pe.yaml rename to NeuralSeq/egs/egs_bases/svs/midi/pe.yaml diff --git a/NeuralSeq/usr/configs/popcs_ds_beta6.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml similarity index 96% rename from NeuralSeq/usr/configs/popcs_ds_beta6.yaml rename to NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml index 699e9f7..000c941 100644 --- a/NeuralSeq/usr/configs/popcs_ds_beta6.yaml +++ b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml @@ -48,8 +48,8 @@ spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.121 -0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035, -0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766] -task_cls: usr.diffsinger_task.DiffSingerTask -#vocoder: usr.singingvocoder.highgan.HighGAN +task_cls: tasks.svs.diffsinger_task.DiffSingerTask +#vocoder: tasks.svs.singingvocoder.highgan.HighGAN #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl vocoder: vocoders.hifigan.HifiGAN vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 diff --git a/NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml similarity index 82% rename from NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml rename to NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml index 84d15d9..8628623 100644 --- a/NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml +++ b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml @@ -3,7 +3,7 @@ base_config: fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer num_valid_plots: 0 -task_cls: usr.diffsinger_task.DiffSingerOfflineTask +task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask # tmp: #pe_enable: true diff --git a/NeuralSeq/usr/configs/popcs_fs2.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml similarity index 94% rename from NeuralSeq/usr/configs/popcs_fs2.yaml rename to NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml index a6b08e9..a9f1a7a 100644 --- 
a/NeuralSeq/usr/configs/popcs_fs2.yaml +++ b/NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml @@ -26,7 +26,7 @@ test_prefixes: [ ] task_cls: tasks.tts.fs2.FastSpeech2Task -#vocoder: usr.singingvocoder.highgan.HighGAN +#vocoder: tasks.svs.singingvocoder.highgan.HighGAN #vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl vocoder: vocoders.hifigan.HifiGAN vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128 diff --git a/NeuralSeq/inference/svs/ds_cascade.py b/NeuralSeq/inference/svs/ds_cascade.py index c62fdd4..f0ec5ed 100644 --- a/NeuralSeq/inference/svs/ds_cascade.py +++ b/NeuralSeq/inference/svs/ds_cascade.py @@ -2,8 +2,8 @@ import torch from inference.svs.base_svs_infer import BaseSVSInfer from utils import load_ckpt from utils.hparams import hparams -from usr.diff.shallow_diffusion_tts import GaussianDiffusion -from usr.diffsinger_task import DIFF_DECODERS +from modules.diff.shallow_diffusion_tts import GaussianDiffusion +from tasks.svs.diffsinger_task import DIFF_DECODERS class DiffSingerCascadeInfer(BaseSVSInfer): def build_model(self): @@ -51,4 +51,4 @@ if __name__ == '__main__': } # input like Opencpop dataset. 
DiffSingerCascadeInfer.example_run(inp) -# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi \ No newline at end of file +# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi \ No newline at end of file diff --git a/NeuralSeq/inference/svs/ds_e2e.py b/NeuralSeq/inference/svs/ds_e2e.py index 68590a8..3b2b9ad 100644 --- a/NeuralSeq/inference/svs/ds_e2e.py +++ b/NeuralSeq/inference/svs/ds_e2e.py @@ -4,8 +4,8 @@ import torch from inference.svs.base_svs_infer import BaseSVSInfer from utils import load_ckpt from utils.hparams import hparams -from usr.diff.shallow_diffusion_tts import GaussianDiffusion -from usr.diffsinger_task import DIFF_DECODERS +from modules.diff.shallow_diffusion_tts import GaussianDiffusion +from tasks.svs.diffsinger_task import DIFF_DECODERS from modules.fastspeech.pe import PitchExtractor import utils @@ -64,4 +64,4 @@ if __name__ == '__main__': DiffSingerE2EInfer.example_run(inp) -# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel \ No newline at end of file +# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel \ No newline at end of file diff --git a/NeuralSeq/inference/svs/gradio/gradio_settings.yaml b/NeuralSeq/inference/svs/gradio/gradio_settings.yaml deleted file mode 100644 index d0b3666..0000000 --- a/NeuralSeq/inference/svs/gradio/gradio_settings.yaml +++ /dev/null @@ -1,27 +0,0 @@ -title: 'DiffSinger' -description: | - This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。) - - Please assign pitch and duration values to each Chinese character. 
The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。) - - You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。) - - Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。) - -article: | - Link to Github REPO -example_inputs: - - |- - 你 说 你 不 SP 懂 为 何 在 这 时 牵 手 APD#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590 - - |- - 小酒窝长睫毛AP是你最美的记号C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db40.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340 - - |- - 我真的SP爱你SP句句不轻易D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A40.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9 - - |- - 好冷啊 AP 我在东北玩泥巴F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D40.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4 - -#inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer -#exp_name: 0303_opencpop_ds58_midi - -inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer -exp_name: 0831_opencpop_ds1000 \ No newline at end of file diff --git a/NeuralSeq/inference/svs/gradio/infer.py b/NeuralSeq/inference/svs/gradio/infer.py deleted file mode 100644 index b8de498..0000000 --- a/NeuralSeq/inference/svs/gradio/infer.py +++ /dev/null @@ -1,91 +0,0 @@ -import importlib -import re - -import gradio 
as gr -import yaml -from gradio.inputs import Textbox - -from inference.svs.base_svs_infer import BaseSVSInfer -from utils.hparams import set_hparams -from utils.hparams import hparams as hp -import numpy as np - - -class GradioInfer: - def __init__(self, exp_name, inference_cls, title, description, article, example_inputs): - self.exp_name = exp_name - self.title = title - self.description = description - self.article = article - self.example_inputs = example_inputs - pkg = ".".join(inference_cls.split(".")[:-1]) - cls_name = inference_cls.split(".")[-1] - self.inference_cls = getattr(importlib.import_module(pkg), cls_name) - - def greet(self, text, notes, notes_duration): - PUNCS = '。?;:' - sents = re.split(rf'([{PUNCS}])', text.replace('\n', ',')) - sents_notes = re.split(rf'([{PUNCS}])', notes.replace('\n', ',')) - sents_notes_dur = re.split(rf'([{PUNCS}])', notes_duration.replace('\n', ',')) - - if sents[-1] not in list(PUNCS): - sents = sents + [''] - sents_notes = sents_notes + [''] - sents_notes_dur = sents_notes_dur + [''] - - audio_outs = [] - s, n, n_dur = "", "", "" - for i in range(0, len(sents), 2): - if len(sents[i]) > 0: - s += sents[i] + sents[i + 1] - n += sents_notes[i] + sents_notes[i+1] - n_dur += sents_notes_dur[i] + sents_notes_dur[i+1] - if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0): - audio_out = self.infer_ins.infer_once({ - 'text': s, - 'notes': n, - 'notes_duration': n_dur, - }) - audio_out = audio_out * 32767 - audio_out = audio_out.astype(np.int16) - audio_outs.append(audio_out) - audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16)) - s = "" - n = "" - audio_outs = np.concatenate(audio_outs) - return hp['audio_sample_rate'], audio_outs - - def run(self): - set_hparams(exp_name=self.exp_name, print_hparams=False) - infer_cls = self.inference_cls - self.infer_ins: BaseSVSInfer = infer_cls(hp) - example_inputs = self.example_inputs - for i in range(len(example_inputs)): - text, notes, notes_dur = 
example_inputs[i].split('') - example_inputs[i] = [text, notes, notes_dur] - - iface = gr.Interface(fn=self.greet, - inputs=[ - Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"), - Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"), - Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration")] - , - outputs="audio", - allow_flagging="never", - title=self.title, - description=self.description, - article=self.article, - examples=example_inputs, - enable_queue=True) - iface.launch(share=True,)# cache_examples=True) - - -if __name__ == '__main__': - gradio_config = yaml.safe_load(open('inference/svs/gradio/gradio_settings.yaml')) - g = GradioInfer(**gradio_config) - g.run() - - -# python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi -# python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi -# CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel \ No newline at end of file diff --git a/NeuralSeq/inference/tts/GenerSpeech.py b/NeuralSeq/inference/tts/GenerSpeech.py index 689fe20..5052733 100644 --- a/NeuralSeq/inference/tts/GenerSpeech.py +++ b/NeuralSeq/inference/tts/GenerSpeech.py @@ -1,12 +1,15 @@ import torch +import os +import importlib from inference.tts.base_tts_infer import BaseTTSInfer from utils.ckpt_utils import load_ckpt, get_last_checkpoint from modules.GenerSpeech.model.generspeech import GenerSpeech -import os from data_gen.tts.emotion import inference as EmotionEncoder from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance from data_gen.tts.emotion.inference import preprocess_wav - +from data_gen.tts.data_gen_utils import is_sil_phoneme +from resemblyzer import VoiceEncoder +from utils import 
audio class GenerSpeechInfer(BaseTTSInfer): def build_model(self): model = GenerSpeech(self.ph_encoder) diff --git a/NeuralSeq/inference/tts/SyntaSpeech.py b/NeuralSeq/inference/tts/PortaSpeech.py similarity index 100% rename from NeuralSeq/inference/tts/SyntaSpeech.py rename to NeuralSeq/inference/tts/PortaSpeech.py diff --git a/NeuralSeq/inference/tts/base_tts_infer.py b/NeuralSeq/inference/tts/base_tts_infer.py index eb21517..6478f91 100644 --- a/NeuralSeq/inference/tts/base_tts_infer.py +++ b/NeuralSeq/inference/tts/base_tts_infer.py @@ -1,6 +1,3 @@ -from data_gen.tts.data_gen_utils import is_sil_phoneme -from resemblyzer import VoiceEncoder -from data_gen.tts.data_gen_utils import build_phone_encoder, build_word_encoder from tasks.tts.dataset_utils import FastSpeechWordDataset from tasks.tts.tts_utils import load_data_preprocessor from vocoders.hifigan import HifiGanGenerator diff --git a/NeuralSeq/inference/tts/gradio/gradio_settings.yaml b/NeuralSeq/inference/tts/gradio/gradio_settings.yaml deleted file mode 100644 index 778b670..0000000 --- a/NeuralSeq/inference/tts/gradio/gradio_settings.yaml +++ /dev/null @@ -1,13 +0,0 @@ -title: 'Rongjiehuang/GenerSpeech' -description: | - Gradio demo for Rongjiehuang/GenerSpeech. To use it, simply add your audio, or click one of the examples to load them. -article: | - Link to Github REPO -example_inputs: - - |- - the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing. 
- - |- - produced the block books, which were the immediate predecessors of the true printed book, -inference_cls: inference.GenerSpeech.GenerSpeechInfer -exp_name: GenerSpeech -config: modules/GenerSpeech/config/prodiff_teacher.yaml \ No newline at end of file diff --git a/NeuralSeq/inference/tts/gradio/infer.py b/NeuralSeq/inference/tts/gradio/infer.py deleted file mode 100644 index 2baff8d..0000000 --- a/NeuralSeq/inference/tts/gradio/infer.py +++ /dev/null @@ -1,72 +0,0 @@ -import importlib -import re - -import gradio as gr -import yaml -from gradio.inputs import Textbox, Audio - -from inference.base_tts_infer import BaseTTSInfer -from utils.hparams import set_hparams -from utils.hparams import hparams as hp -import numpy as np - -from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS - -class GradioInfer: - def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs): - self.exp_name = exp_name - self.config = config - self.title = title - self.description = description - self.article = article - self.example_inputs = example_inputs - pkg = ".".join(inference_cls.split(".")[:-1]) - cls_name = inference_cls.split(".")[-1] - self.inference_cls = getattr(importlib.import_module(pkg), cls_name) - - def greet(self, text, audio): - sents = re.split(rf'([{PUNCS}])', text.replace('\n', ',')) - if sents[-1] not in list(PUNCS): - sents = sents + ['.'] - audio_outs = [] - s = "" - for i in range(0, len(sents), 2): - if len(sents[i]) > 0: - s += sents[i] + sents[i + 1] - if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0): - audio_out = self.infer_ins.infer_once({ - 'text': s, - 'ref_audio': audio - }) - audio_out = audio_out * 32767 - audio_out = audio_out.astype(np.int16) - audio_outs.append(audio_out) - audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16)) - s = "" - audio_outs = np.concatenate(audio_outs) - return hp['audio_sample_rate'], audio_outs - - def run(self): - 
set_hparams(exp_name=self.exp_name, config=self.config) - infer_cls = self.inference_cls - self.infer_ins: BaseTTSInfer = infer_cls(hp) - example_inputs = self.example_inputs - iface = gr.Interface(fn=self.greet, - inputs=[ - Textbox(lines=10, placeholder=None, default=example_inputs[0], label="input text"), - Audio(label="reference audio"), - ], - outputs="audio", - allow_flagging="never", - title=self.title, - description=self.description, - article=self.article, - examples=example_inputs, - enable_queue=True) - iface.launch() - - -if __name__ == '__main__': - gradio_config = yaml.safe_load(open('inference/gradio/gradio_settings.yaml')) - g = GradioInfer(**gradio_config) - g.run() diff --git a/NeuralSeq/modules/commons/common_layers.py b/NeuralSeq/modules/commons/common_layers.py index 8020221..0657b0d 100644 --- a/NeuralSeq/modules/commons/common_layers.py +++ b/NeuralSeq/modules/commons/common_layers.py @@ -76,26 +76,6 @@ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False) pass return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) -class LayerNorm_(torch.nn.LayerNorm): - """Layer normalization module. - :param int nout: output dim size - :param int dim: dimension to be normalized - """ - - def __init__(self, nout, dim=-1, eps=1e-5): - """Construct an LayerNorm object.""" - super(LayerNorm_, self).__init__(nout, eps=eps) - self.dim = dim - - def forward(self, x): - """Apply layer normalization. 
- :param torch.Tensor x: input tensor - :return: layer normalized tensor - :rtype torch.Tensor - """ - if self.dim == -1: - return super(LayerNorm_, self).forward(x) - return super(LayerNorm_, self).forward(x.transpose(1, -1)).transpose(1, -1) def Linear(in_features, out_features, bias=True): m = nn.Linear(in_features, out_features, bias) nn.init.xavier_uniform_(m.weight) diff --git a/NeuralSeq/modules/commons/conv.py b/NeuralSeq/modules/commons/conv.py index f2d80ad..a86505f 100644 --- a/NeuralSeq/modules/commons/conv.py +++ b/NeuralSeq/modules/commons/conv.py @@ -3,7 +3,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from modules.commons.common_layers import LayerNorm_, Embedding +from modules.commons.common_layers import Embedding +from modules.fastspeech.tts_modules import LayerNorm class LambdaLayer(nn.Module): @@ -35,7 +36,7 @@ class ResidualBlock(nn.Module): elif norm_type == 'gn': norm_builder = lambda: nn.GroupNorm(8, channels) elif norm_type == 'ln': - norm_builder = lambda: LayerNorm_(channels, eps=ln_eps) + norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps) else: norm_builder = lambda: nn.Identity() @@ -89,7 +90,7 @@ class ConvBlocks(nn.Module): elif norm_type == 'gn': norm = nn.GroupNorm(8, hidden_size) elif norm_type == 'ln': - norm = LayerNorm_(hidden_size, eps=ln_eps) + norm = LayerNorm(hidden_size, dim=1, eps=ln_eps) self.last_norm = norm self.post_net1 = nn.Conv1d(hidden_size, out_dims, kernel_size=post_net_kernel, padding=post_net_kernel // 2) diff --git a/NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-37.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-37.pyc rename to NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-37.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-38.pyc 
similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-38.pyc rename to NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-38.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/diffusion.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/diffusion.cpython-37.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/diffusion.cpython-37.pyc rename to NeuralSeq/modules/diff/__pycache__/diffusion.cpython-37.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/diffusion.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/diffusion.cpython-38.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/diffusion.cpython-38.pyc rename to NeuralSeq/modules/diff/__pycache__/diffusion.cpython-38.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/net.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/net.cpython-37.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/net.cpython-37.pyc rename to NeuralSeq/modules/diff/__pycache__/net.cpython-37.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/net.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/net.cpython-38.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/net.cpython-38.pyc rename to NeuralSeq/modules/diff/__pycache__/net.cpython-38.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc rename to NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc diff --git a/NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc similarity index 100% rename from NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc rename to NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc diff --git 
a/NeuralSeq/usr/diff/candidate_decoder.py b/NeuralSeq/modules/diff/candidate_decoder.py similarity index 100% rename from NeuralSeq/usr/diff/candidate_decoder.py rename to NeuralSeq/modules/diff/candidate_decoder.py diff --git a/NeuralSeq/usr/diff/diffusion.py b/NeuralSeq/modules/diff/diffusion.py similarity index 100% rename from NeuralSeq/usr/diff/diffusion.py rename to NeuralSeq/modules/diff/diffusion.py diff --git a/NeuralSeq/usr/diff/net.py b/NeuralSeq/modules/diff/net.py similarity index 100% rename from NeuralSeq/usr/diff/net.py rename to NeuralSeq/modules/diff/net.py diff --git a/NeuralSeq/usr/diff/shallow_diffusion_tts.py b/NeuralSeq/modules/diff/shallow_diffusion_tts.py similarity index 100% rename from NeuralSeq/usr/diff/shallow_diffusion_tts.py rename to NeuralSeq/modules/diff/shallow_diffusion_tts.py diff --git a/NeuralSeq/modules/portaspeech/fs.py b/NeuralSeq/modules/portaspeech/fs.py index c1e1b1a..477cc65 100755 --- a/NeuralSeq/modules/portaspeech/fs.py +++ b/NeuralSeq/modules/portaspeech/fs.py @@ -3,8 +3,8 @@ import torch from torch import nn import torch.nn.functional as F from modules.commons.conv import TextConvEncoder, ConvBlocks -from modules.commons.common_layers import Embedding, LayerNorm_ -from modules.fastspeech.tts_modules import PitchPredictor, LengthRegulator +from modules.commons.common_layers import Embedding +from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states from utils.pitch_utils import denorm_f0, f0_to_coarse @@ -34,7 +34,7 @@ class DurationPredictor(torch.nn.Module): self.conv += [torch.nn.Sequential( torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2), torch.nn.ReLU(), - LayerNorm_(n_chans, dim=1), + LayerNorm(n_chans, dim=1), torch.nn.Dropout(dropout_rate) )] self.linear = 
nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus()) diff --git a/NeuralSeq/usr/__init__.py b/NeuralSeq/tasks/svs/__init__.py similarity index 100% rename from NeuralSeq/usr/__init__.py rename to NeuralSeq/tasks/svs/__init__.py diff --git a/NeuralSeq/usr/diffsinger_task.py b/NeuralSeq/tasks/svs/diffsinger_task.py similarity index 98% rename from NeuralSeq/usr/diffsinger_task.py rename to NeuralSeq/tasks/svs/diffsinger_task.py index ab586e1..78e6544 100644 --- a/NeuralSeq/usr/diffsinger_task.py +++ b/NeuralSeq/tasks/svs/diffsinger_task.py @@ -2,16 +2,16 @@ import torch import utils from utils.hparams import hparams -from .diff.net import DiffNet -from .diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion -from .diffspeech_task import DiffSpeechTask +from modules.diff.net import DiffNet +from modules.diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion +from tasks.svs.diffspeech_task import DiffSpeechTask from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder from modules.fastspeech.pe import PitchExtractor from modules.fastspeech.fs2 import FastSpeech2 from modules.diffsinger_midi.fs2 import FastSpeech2MIDI from modules.fastspeech.tts_modules import mel2ph_to_dur -from usr.diff.candidate_decoder import FFT +from modules.diff.candidate_decoder import FFT from utils.pitch_utils import denorm_f0 from tasks.tts.fs2_utils import FastSpeechDataset from tasks.tts.fs2 import FastSpeech2Task diff --git a/NeuralSeq/usr/diffspeech_task.py b/NeuralSeq/tasks/svs/diffspeech_task.py similarity index 97% rename from NeuralSeq/usr/diffspeech_task.py rename to NeuralSeq/tasks/svs/diffspeech_task.py index 05c313f..cf303d5 100644 --- a/NeuralSeq/usr/diffspeech_task.py +++ b/NeuralSeq/tasks/svs/diffspeech_task.py @@ -2,9 +2,9 @@ import torch import utils from utils.hparams import hparams -from .diff.net import DiffNet -from .diff.shallow_diffusion_tts import GaussianDiffusion -from .task import DiffFsTask +from 
modules.diff.net import DiffNet +from modules.diff.shallow_diffusion_tts import GaussianDiffusion +from tasks.svs.task import DiffFsTask from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder from utils.pitch_utils import denorm_f0 from tasks.tts.fs2_utils import FastSpeechDataset diff --git a/NeuralSeq/usr/task.py b/NeuralSeq/tasks/svs/task.py similarity index 97% rename from NeuralSeq/usr/task.py rename to NeuralSeq/tasks/svs/task.py index 3e34db2..896970e 100644 --- a/NeuralSeq/usr/task.py +++ b/NeuralSeq/tasks/svs/task.py @@ -1,8 +1,8 @@ import torch import utils -from .diff.diffusion import GaussianDiffusion -from .diff.net import DiffNet +from modules.diff.diffusion import GaussianDiffusion +from modules.diff.net import DiffNet from tasks.tts.fs2 import FastSpeech2Task from utils.hparams import hparams diff --git a/NeuralSeq/tasks/tts/fs2.py b/NeuralSeq/tasks/tts/fs2.py index 2e06771..620d7f3 100644 --- a/NeuralSeq/tasks/tts/fs2.py +++ b/NeuralSeq/tasks/tts/fs2.py @@ -1,7 +1,5 @@ import matplotlib - matplotlib.use('Agg') - from utils import audio import matplotlib.pyplot as plt from data_gen.tts.data_gen_utils import get_pitch diff --git a/audio-chatgpt.py b/audio-chatgpt.py index 56a3ead..47884e3 100644 --- a/audio-chatgpt.py +++ b/audio-chatgpt.py @@ -38,7 +38,7 @@ from audio_to_text.inference_waveform import AudioCapModel import whisper from inference.svs.ds_e2e import DiffSingerE2EInfer from inference.tts.GenerSpeech import GenerSpeechInfer -from inference.tts.SyntaSpeech import TTSInference +from inference.tts.PortaSpeech import TTSInference from utils.hparams import set_hparams from utils.hparams import hparams as hp import scipy.io.wavfile as wavfile @@ -282,7 +282,7 @@ class T2S: print("Initializing DiffSinger to %s" % device) self.device = device self.exp_name = 'checkpoints/0831_opencpop_ds1000' - self.config= 'NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml' + self.config= 'NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml' 
self.set_model_hparams() self.pipe = DiffSingerE2EInfer(self.hp, device) self.default_inp = {