diff --git a/NeuralSeq/usr/configs/base.yaml b/NeuralSeq/egs/egs_bases/svs/base.yaml
similarity index 92%
rename from NeuralSeq/usr/configs/base.yaml
rename to NeuralSeq/egs/egs_bases/svs/base.yaml
index f5c13e6..ec30695 100644
--- a/NeuralSeq/usr/configs/base.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/base.yaml
@@ -1,4 +1,4 @@
-task_cls: usr.task.DiffFsTask
+task_cls: tasks.svs.task.DiffFsTask
pitch_type: frame
timesteps: 100
dilation_cycle_length: 1
diff --git a/NeuralSeq/usr/configs/lj_ds_beta6.yaml b/NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml
similarity index 97%
rename from NeuralSeq/usr/configs/lj_ds_beta6.yaml
rename to NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml
index a0fcebb..7fd443c 100644
--- a/NeuralSeq/usr/configs/lj_ds_beta6.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/lj_ds_beta6.yaml
@@ -23,7 +23,7 @@ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.70
0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566,
0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ]
-task_cls: usr.diffspeech_task.DiffSpeechTask
+task_cls: tasks.svs.diffspeech_task.DiffSpeechTask
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0414_hifi_lj_1
num_valid_plots: 10
diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml
similarity index 87%
rename from NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml
index e8267c0..5c204cb 100644
--- a/NeuralSeq/usr/configs/midi/cascade/opencs/aux_rel.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/aux_rel.yaml
@@ -1,6 +1,6 @@
base_config:
- configs/singing/fs2.yaml
- - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+ - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
audio_sample_rate: 24000
hop_size: 128 # Hop size.
@@ -42,8 +42,8 @@ test_prefixes: [
'2100',
]
-task_cls: usr.diffsinger_task.AuxDecoderMIDITask
-#vocoder: usr.singingvocoder.highgan.HighGAN
+task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask
+#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml
similarity index 81%
rename from NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml
index 7f1b6bd..820a608 100644
--- a/NeuralSeq/usr/configs/midi/cascade/opencs/ds60_rel.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml
@@ -1,6 +1,6 @@
base_config:
- - usr/configs/popcs_ds_beta6.yaml
- - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+ - egs/egs_bases/svs/popcs_ds_beta6.yaml
+ - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -21,7 +21,7 @@ pe_ckpt: ''
fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' #
#num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerMIDITask
+task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 60
max_tokens: 36000
diff --git a/NeuralSeq/usr/configs/midi/cascade/opencs/opencpop_statis.yaml b/NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
similarity index 100%
rename from NeuralSeq/usr/configs/midi/cascade/opencs/opencpop_statis.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml
similarity index 81%
rename from NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml
index 4443ca7..8825430 100644
--- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000-10dil.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000-10dil.yaml
@@ -1,6 +1,6 @@
base_config:
- - usr/configs/popcs_ds_beta6.yaml
- - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+ - egs/egs_bases/svs/popcs_ds_beta6.yaml
+ - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerMIDITask
+task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
timesteps: 1000
K_step: 1000
diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml
similarity index 81%
rename from NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml
index 1feb508..3c942cd 100644
--- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml
@@ -1,6 +1,6 @@
base_config:
- - usr/configs/popcs_ds_beta6.yaml
- - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+ - egs/egs_bases/svs/popcs_ds_beta6.yaml
+ - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerMIDITask
+task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
# for diffusion schedule
timesteps: 1000
diff --git a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml
similarity index 80%
rename from NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml
index 5ff1b00..3cdda09 100644
--- a/NeuralSeq/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml
@@ -1,6 +1,6 @@
base_config:
- - usr/configs/popcs_ds_beta6.yaml
- - usr/configs/midi/cascade/opencs/opencpop_statis.yaml
+ - egs/egs_bases/svs/popcs_ds_beta6.yaml
+ - egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerMIDITask
+task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 36000
diff --git a/NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml b/NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml
similarity index 80%
rename from NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml
index 7380923..b51b4f2 100644
--- a/NeuralSeq/usr/configs/midi/e2e/popcs/ds100_adj_rel.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/midi/e2e/popcs/ds100_adj_rel.yaml
@@ -1,6 +1,6 @@
base_config:
- - usr/configs/popcs_ds_beta6.yaml
- - usr/configs/midi/cascade/popcs/popcs_statis.yaml
+ - egs/egs_bases/svs/popcs_ds_beta6.yaml
+ - egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml
binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer
binary_data_dir: 'data/binary/popcs-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerMIDITask
+task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 40000
diff --git a/NeuralSeq/usr/configs/midi/pe.yaml b/NeuralSeq/egs/egs_bases/svs/midi/pe.yaml
similarity index 100%
rename from NeuralSeq/usr/configs/midi/pe.yaml
rename to NeuralSeq/egs/egs_bases/svs/midi/pe.yaml
diff --git a/NeuralSeq/usr/configs/popcs_ds_beta6.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml
similarity index 96%
rename from NeuralSeq/usr/configs/popcs_ds_beta6.yaml
rename to NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml
index 699e9f7..000c941 100644
--- a/NeuralSeq/usr/configs/popcs_ds_beta6.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6.yaml
@@ -48,8 +48,8 @@ spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.121
-0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035,
-0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766]
-task_cls: usr.diffsinger_task.DiffSingerTask
-#vocoder: usr.singingvocoder.highgan.HighGAN
+task_cls: tasks.svs.diffsinger_task.DiffSingerTask
+#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
diff --git a/NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml
similarity index 82%
rename from NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml
rename to NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml
index 84d15d9..8628623 100644
--- a/NeuralSeq/usr/configs/popcs_ds_beta6_offline.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/popcs_ds_beta6_offline.yaml
@@ -3,7 +3,7 @@ base_config:
fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer
num_valid_plots: 0
-task_cls: usr.diffsinger_task.DiffSingerOfflineTask
+task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask
# tmp:
#pe_enable: true
diff --git a/NeuralSeq/usr/configs/popcs_fs2.yaml b/NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml
similarity index 94%
rename from NeuralSeq/usr/configs/popcs_fs2.yaml
rename to NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml
index a6b08e9..a9f1a7a 100644
--- a/NeuralSeq/usr/configs/popcs_fs2.yaml
+++ b/NeuralSeq/egs/egs_bases/svs/popcs_fs2.yaml
@@ -26,7 +26,7 @@ test_prefixes: [
]
task_cls: tasks.tts.fs2.FastSpeech2Task
-#vocoder: usr.singingvocoder.highgan.HighGAN
+#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128
diff --git a/NeuralSeq/inference/svs/ds_cascade.py b/NeuralSeq/inference/svs/ds_cascade.py
index c62fdd4..f0ec5ed 100644
--- a/NeuralSeq/inference/svs/ds_cascade.py
+++ b/NeuralSeq/inference/svs/ds_cascade.py
@@ -2,8 +2,8 @@ import torch
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
-from usr.diff.shallow_diffusion_tts import GaussianDiffusion
-from usr.diffsinger_task import DIFF_DECODERS
+from modules.diff.shallow_diffusion_tts import GaussianDiffusion
+from tasks.svs.diffsinger_task import DIFF_DECODERS
class DiffSingerCascadeInfer(BaseSVSInfer):
def build_model(self):
@@ -51,4 +51,4 @@ if __name__ == '__main__':
} # input like Opencpop dataset.
DiffSingerCascadeInfer.example_run(inp)
-# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
\ No newline at end of file
+# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
\ No newline at end of file
diff --git a/NeuralSeq/inference/svs/ds_e2e.py b/NeuralSeq/inference/svs/ds_e2e.py
index 68590a8..3b2b9ad 100644
--- a/NeuralSeq/inference/svs/ds_e2e.py
+++ b/NeuralSeq/inference/svs/ds_e2e.py
@@ -4,8 +4,8 @@ import torch
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
-from usr.diff.shallow_diffusion_tts import GaussianDiffusion
-from usr.diffsinger_task import DIFF_DECODERS
+from modules.diff.shallow_diffusion_tts import GaussianDiffusion
+from tasks.svs.diffsinger_task import DIFF_DECODERS
from modules.fastspeech.pe import PitchExtractor
import utils
@@ -64,4 +64,4 @@ if __name__ == '__main__':
DiffSingerE2EInfer.example_run(inp)
-# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
\ No newline at end of file
+# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
\ No newline at end of file
diff --git a/NeuralSeq/inference/svs/gradio/gradio_settings.yaml b/NeuralSeq/inference/svs/gradio/gradio_settings.yaml
deleted file mode 100644
index d0b3666..0000000
--- a/NeuralSeq/inference/svs/gradio/gradio_settings.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-title: 'DiffSinger'
-description: |
- This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。)
-
- Please assign pitch and duration values to each Chinese character. The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。)
-
- You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。)
-
- Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。)
-
-article: |
- Link to Github REPO
-example_inputs:
- - |-
- 你 说 你 不 SP 懂 为 何 在 这 时 牵 手 APD#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590
- - |-
- 小酒窝长睫毛AP是你最美的记号C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db40.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340
- - |-
- 我真的SP爱你SP句句不轻易D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A40.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9
- - |-
- 好冷啊 AP 我在东北玩泥巴F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D40.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4
-
-#inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer
-#exp_name: 0303_opencpop_ds58_midi
-
-inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer
-exp_name: 0831_opencpop_ds1000
\ No newline at end of file
diff --git a/NeuralSeq/inference/svs/gradio/infer.py b/NeuralSeq/inference/svs/gradio/infer.py
deleted file mode 100644
index b8de498..0000000
--- a/NeuralSeq/inference/svs/gradio/infer.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import importlib
-import re
-
-import gradio as gr
-import yaml
-from gradio.inputs import Textbox
-
-from inference.svs.base_svs_infer import BaseSVSInfer
-from utils.hparams import set_hparams
-from utils.hparams import hparams as hp
-import numpy as np
-
-
-class GradioInfer:
- def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
- self.exp_name = exp_name
- self.title = title
- self.description = description
- self.article = article
- self.example_inputs = example_inputs
- pkg = ".".join(inference_cls.split(".")[:-1])
- cls_name = inference_cls.split(".")[-1]
- self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
-
- def greet(self, text, notes, notes_duration):
- PUNCS = '。?;:'
- sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
- sents_notes = re.split(rf'([{PUNCS}])', notes.replace('\n', ','))
- sents_notes_dur = re.split(rf'([{PUNCS}])', notes_duration.replace('\n', ','))
-
- if sents[-1] not in list(PUNCS):
- sents = sents + ['']
- sents_notes = sents_notes + ['']
- sents_notes_dur = sents_notes_dur + ['']
-
- audio_outs = []
- s, n, n_dur = "", "", ""
- for i in range(0, len(sents), 2):
- if len(sents[i]) > 0:
- s += sents[i] + sents[i + 1]
- n += sents_notes[i] + sents_notes[i+1]
- n_dur += sents_notes_dur[i] + sents_notes_dur[i+1]
- if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
- audio_out = self.infer_ins.infer_once({
- 'text': s,
- 'notes': n,
- 'notes_duration': n_dur,
- })
- audio_out = audio_out * 32767
- audio_out = audio_out.astype(np.int16)
- audio_outs.append(audio_out)
- audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
- s = ""
- n = ""
- audio_outs = np.concatenate(audio_outs)
- return hp['audio_sample_rate'], audio_outs
-
- def run(self):
- set_hparams(exp_name=self.exp_name, print_hparams=False)
- infer_cls = self.inference_cls
- self.infer_ins: BaseSVSInfer = infer_cls(hp)
- example_inputs = self.example_inputs
- for i in range(len(example_inputs)):
- text, notes, notes_dur = example_inputs[i].split('')
- example_inputs[i] = [text, notes, notes_dur]
-
- iface = gr.Interface(fn=self.greet,
- inputs=[
- Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"),
- Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"),
- Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration")]
- ,
- outputs="audio",
- allow_flagging="never",
- title=self.title,
- description=self.description,
- article=self.article,
- examples=example_inputs,
- enable_queue=True)
- iface.launch(share=True,)# cache_examples=True)
-
-
-if __name__ == '__main__':
- gradio_config = yaml.safe_load(open('inference/svs/gradio/gradio_settings.yaml'))
- g = GradioInfer(**gradio_config)
- g.run()
-
-
-# python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
-# python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
-# CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
\ No newline at end of file
diff --git a/NeuralSeq/inference/tts/GenerSpeech.py b/NeuralSeq/inference/tts/GenerSpeech.py
index 689fe20..5052733 100644
--- a/NeuralSeq/inference/tts/GenerSpeech.py
+++ b/NeuralSeq/inference/tts/GenerSpeech.py
@@ -1,12 +1,15 @@
import torch
+import os
+import importlib
from inference.tts.base_tts_infer import BaseTTSInfer
from utils.ckpt_utils import load_ckpt, get_last_checkpoint
from modules.GenerSpeech.model.generspeech import GenerSpeech
-import os
from data_gen.tts.emotion import inference as EmotionEncoder
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
from data_gen.tts.emotion.inference import preprocess_wav
-
+from data_gen.tts.data_gen_utils import is_sil_phoneme
+from resemblyzer import VoiceEncoder
+from utils import audio
class GenerSpeechInfer(BaseTTSInfer):
def build_model(self):
model = GenerSpeech(self.ph_encoder)
diff --git a/NeuralSeq/inference/tts/SyntaSpeech.py b/NeuralSeq/inference/tts/PortaSpeech.py
similarity index 100%
rename from NeuralSeq/inference/tts/SyntaSpeech.py
rename to NeuralSeq/inference/tts/PortaSpeech.py
diff --git a/NeuralSeq/inference/tts/base_tts_infer.py b/NeuralSeq/inference/tts/base_tts_infer.py
index eb21517..6478f91 100644
--- a/NeuralSeq/inference/tts/base_tts_infer.py
+++ b/NeuralSeq/inference/tts/base_tts_infer.py
@@ -1,6 +1,3 @@
-from data_gen.tts.data_gen_utils import is_sil_phoneme
-from resemblyzer import VoiceEncoder
-from data_gen.tts.data_gen_utils import build_phone_encoder, build_word_encoder
from tasks.tts.dataset_utils import FastSpeechWordDataset
from tasks.tts.tts_utils import load_data_preprocessor
from vocoders.hifigan import HifiGanGenerator
diff --git a/NeuralSeq/inference/tts/gradio/gradio_settings.yaml b/NeuralSeq/inference/tts/gradio/gradio_settings.yaml
deleted file mode 100644
index 778b670..0000000
--- a/NeuralSeq/inference/tts/gradio/gradio_settings.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-title: 'Rongjiehuang/GenerSpeech'
-description: |
- Gradio demo for Rongjiehuang/GenerSpeech. To use it, simply add your audio, or click one of the examples to load them.
-article: |
- Link to Github REPO
-example_inputs:
- - |-
- the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
- - |-
- produced the block books, which were the immediate predecessors of the true printed book,
-inference_cls: inference.GenerSpeech.GenerSpeechInfer
-exp_name: GenerSpeech
-config: modules/GenerSpeech/config/prodiff_teacher.yaml
\ No newline at end of file
diff --git a/NeuralSeq/inference/tts/gradio/infer.py b/NeuralSeq/inference/tts/gradio/infer.py
deleted file mode 100644
index 2baff8d..0000000
--- a/NeuralSeq/inference/tts/gradio/infer.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import importlib
-import re
-
-import gradio as gr
-import yaml
-from gradio.inputs import Textbox, Audio
-
-from inference.base_tts_infer import BaseTTSInfer
-from utils.hparams import set_hparams
-from utils.hparams import hparams as hp
-import numpy as np
-
-from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
-
-class GradioInfer:
- def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs):
- self.exp_name = exp_name
- self.config = config
- self.title = title
- self.description = description
- self.article = article
- self.example_inputs = example_inputs
- pkg = ".".join(inference_cls.split(".")[:-1])
- cls_name = inference_cls.split(".")[-1]
- self.inference_cls = getattr(importlib.import_module(pkg), cls_name)
-
- def greet(self, text, audio):
- sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
- if sents[-1] not in list(PUNCS):
- sents = sents + ['.']
- audio_outs = []
- s = ""
- for i in range(0, len(sents), 2):
- if len(sents[i]) > 0:
- s += sents[i] + sents[i + 1]
- if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
- audio_out = self.infer_ins.infer_once({
- 'text': s,
- 'ref_audio': audio
- })
- audio_out = audio_out * 32767
- audio_out = audio_out.astype(np.int16)
- audio_outs.append(audio_out)
- audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
- s = ""
- audio_outs = np.concatenate(audio_outs)
- return hp['audio_sample_rate'], audio_outs
-
- def run(self):
- set_hparams(exp_name=self.exp_name, config=self.config)
- infer_cls = self.inference_cls
- self.infer_ins: BaseTTSInfer = infer_cls(hp)
- example_inputs = self.example_inputs
- iface = gr.Interface(fn=self.greet,
- inputs=[
- Textbox(lines=10, placeholder=None, default=example_inputs[0], label="input text"),
- Audio(label="reference audio"),
- ],
- outputs="audio",
- allow_flagging="never",
- title=self.title,
- description=self.description,
- article=self.article,
- examples=example_inputs,
- enable_queue=True)
- iface.launch()
-
-
-if __name__ == '__main__':
- gradio_config = yaml.safe_load(open('inference/gradio/gradio_settings.yaml'))
- g = GradioInfer(**gradio_config)
- g.run()
diff --git a/NeuralSeq/modules/commons/common_layers.py b/NeuralSeq/modules/commons/common_layers.py
index 8020221..0657b0d 100644
--- a/NeuralSeq/modules/commons/common_layers.py
+++ b/NeuralSeq/modules/commons/common_layers.py
@@ -76,26 +76,6 @@ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False)
pass
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
-class LayerNorm_(torch.nn.LayerNorm):
- """Layer normalization module.
- :param int nout: output dim size
- :param int dim: dimension to be normalized
- """
-
- def __init__(self, nout, dim=-1, eps=1e-5):
- """Construct an LayerNorm object."""
- super(LayerNorm_, self).__init__(nout, eps=eps)
- self.dim = dim
-
- def forward(self, x):
- """Apply layer normalization.
- :param torch.Tensor x: input tensor
- :return: layer normalized tensor
- :rtype torch.Tensor
- """
- if self.dim == -1:
- return super(LayerNorm_, self).forward(x)
- return super(LayerNorm_, self).forward(x.transpose(1, -1)).transpose(1, -1)
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)
diff --git a/NeuralSeq/modules/commons/conv.py b/NeuralSeq/modules/commons/conv.py
index f2d80ad..a86505f 100644
--- a/NeuralSeq/modules/commons/conv.py
+++ b/NeuralSeq/modules/commons/conv.py
@@ -3,7 +3,8 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
-from modules.commons.common_layers import LayerNorm_, Embedding
+from modules.commons.common_layers import Embedding
+from modules.fastspeech.tts_modules import LayerNorm
class LambdaLayer(nn.Module):
@@ -35,7 +36,7 @@ class ResidualBlock(nn.Module):
elif norm_type == 'gn':
norm_builder = lambda: nn.GroupNorm(8, channels)
elif norm_type == 'ln':
- norm_builder = lambda: LayerNorm_(channels, eps=ln_eps)
+ norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps)
else:
norm_builder = lambda: nn.Identity()
@@ -89,7 +90,7 @@ class ConvBlocks(nn.Module):
elif norm_type == 'gn':
norm = nn.GroupNorm(8, hidden_size)
elif norm_type == 'ln':
- norm = LayerNorm_(hidden_size, eps=ln_eps)
+ norm = LayerNorm(hidden_size, dim=1, eps=ln_eps)
self.last_norm = norm
self.post_net1 = nn.Conv1d(hidden_size, out_dims, kernel_size=post_net_kernel,
padding=post_net_kernel // 2)
diff --git a/NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-37.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-37.pyc
rename to NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-37.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-38.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/candidate_decoder.cpython-38.pyc
rename to NeuralSeq/modules/diff/__pycache__/candidate_decoder.cpython-38.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/diffusion.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/diffusion.cpython-37.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/diffusion.cpython-37.pyc
rename to NeuralSeq/modules/diff/__pycache__/diffusion.cpython-37.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/diffusion.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/diffusion.cpython-38.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/diffusion.cpython-38.pyc
rename to NeuralSeq/modules/diff/__pycache__/diffusion.cpython-38.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/net.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/net.cpython-37.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/net.cpython-37.pyc
rename to NeuralSeq/modules/diff/__pycache__/net.cpython-37.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/net.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/net.cpython-38.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/net.cpython-38.pyc
rename to NeuralSeq/modules/diff/__pycache__/net.cpython-38.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc b/NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc
rename to NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-37.pyc
diff --git a/NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc b/NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc
similarity index 100%
rename from NeuralSeq/usr/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc
rename to NeuralSeq/modules/diff/__pycache__/shallow_diffusion_tts.cpython-38.pyc
diff --git a/NeuralSeq/usr/diff/candidate_decoder.py b/NeuralSeq/modules/diff/candidate_decoder.py
similarity index 100%
rename from NeuralSeq/usr/diff/candidate_decoder.py
rename to NeuralSeq/modules/diff/candidate_decoder.py
diff --git a/NeuralSeq/usr/diff/diffusion.py b/NeuralSeq/modules/diff/diffusion.py
similarity index 100%
rename from NeuralSeq/usr/diff/diffusion.py
rename to NeuralSeq/modules/diff/diffusion.py
diff --git a/NeuralSeq/usr/diff/net.py b/NeuralSeq/modules/diff/net.py
similarity index 100%
rename from NeuralSeq/usr/diff/net.py
rename to NeuralSeq/modules/diff/net.py
diff --git a/NeuralSeq/usr/diff/shallow_diffusion_tts.py b/NeuralSeq/modules/diff/shallow_diffusion_tts.py
similarity index 100%
rename from NeuralSeq/usr/diff/shallow_diffusion_tts.py
rename to NeuralSeq/modules/diff/shallow_diffusion_tts.py
diff --git a/NeuralSeq/modules/portaspeech/fs.py b/NeuralSeq/modules/portaspeech/fs.py
index c1e1b1a..477cc65 100755
--- a/NeuralSeq/modules/portaspeech/fs.py
+++ b/NeuralSeq/modules/portaspeech/fs.py
@@ -3,8 +3,8 @@ import torch
from torch import nn
import torch.nn.functional as F
from modules.commons.conv import TextConvEncoder, ConvBlocks
-from modules.commons.common_layers import Embedding, LayerNorm_
-from modules.fastspeech.tts_modules import PitchPredictor, LengthRegulator
+from modules.commons.common_layers import Embedding
+from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
from utils.pitch_utils import denorm_f0, f0_to_coarse
@@ -34,7 +34,7 @@ class DurationPredictor(torch.nn.Module):
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
- LayerNorm_(n_chans, dim=1),
+ LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())
diff --git a/NeuralSeq/usr/__init__.py b/NeuralSeq/tasks/svs/__init__.py
similarity index 100%
rename from NeuralSeq/usr/__init__.py
rename to NeuralSeq/tasks/svs/__init__.py
diff --git a/NeuralSeq/usr/diffsinger_task.py b/NeuralSeq/tasks/svs/diffsinger_task.py
similarity index 98%
rename from NeuralSeq/usr/diffsinger_task.py
rename to NeuralSeq/tasks/svs/diffsinger_task.py
index ab586e1..78e6544 100644
--- a/NeuralSeq/usr/diffsinger_task.py
+++ b/NeuralSeq/tasks/svs/diffsinger_task.py
@@ -2,16 +2,16 @@ import torch
import utils
from utils.hparams import hparams
-from .diff.net import DiffNet
-from .diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
-from .diffspeech_task import DiffSpeechTask
+from modules.diff.net import DiffNet
+from modules.diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
+from tasks.svs.diffspeech_task import DiffSpeechTask
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
from modules.fastspeech.pe import PitchExtractor
from modules.fastspeech.fs2 import FastSpeech2
from modules.diffsinger_midi.fs2 import FastSpeech2MIDI
from modules.fastspeech.tts_modules import mel2ph_to_dur
-from usr.diff.candidate_decoder import FFT
+from modules.diff.candidate_decoder import FFT
from utils.pitch_utils import denorm_f0
from tasks.tts.fs2_utils import FastSpeechDataset
from tasks.tts.fs2 import FastSpeech2Task
diff --git a/NeuralSeq/usr/diffspeech_task.py b/NeuralSeq/tasks/svs/diffspeech_task.py
similarity index 97%
rename from NeuralSeq/usr/diffspeech_task.py
rename to NeuralSeq/tasks/svs/diffspeech_task.py
index 05c313f..cf303d5 100644
--- a/NeuralSeq/usr/diffspeech_task.py
+++ b/NeuralSeq/tasks/svs/diffspeech_task.py
@@ -2,9 +2,9 @@ import torch
import utils
from utils.hparams import hparams
-from .diff.net import DiffNet
-from .diff.shallow_diffusion_tts import GaussianDiffusion
-from .task import DiffFsTask
+from modules.diff.net import DiffNet
+from modules.diff.shallow_diffusion_tts import GaussianDiffusion
+from tasks.svs.task import DiffFsTask
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
from utils.pitch_utils import denorm_f0
from tasks.tts.fs2_utils import FastSpeechDataset
diff --git a/NeuralSeq/usr/task.py b/NeuralSeq/tasks/svs/task.py
similarity index 97%
rename from NeuralSeq/usr/task.py
rename to NeuralSeq/tasks/svs/task.py
index 3e34db2..896970e 100644
--- a/NeuralSeq/usr/task.py
+++ b/NeuralSeq/tasks/svs/task.py
@@ -1,8 +1,8 @@
import torch
import utils
-from .diff.diffusion import GaussianDiffusion
-from .diff.net import DiffNet
+from modules.diff.diffusion import GaussianDiffusion
+from modules.diff.net import DiffNet
from tasks.tts.fs2 import FastSpeech2Task
from utils.hparams import hparams
diff --git a/NeuralSeq/tasks/tts/fs2.py b/NeuralSeq/tasks/tts/fs2.py
index 2e06771..620d7f3 100644
--- a/NeuralSeq/tasks/tts/fs2.py
+++ b/NeuralSeq/tasks/tts/fs2.py
@@ -1,7 +1,5 @@
import matplotlib
-
matplotlib.use('Agg')
-
from utils import audio
import matplotlib.pyplot as plt
from data_gen.tts.data_gen_utils import get_pitch
diff --git a/audio-chatgpt.py b/audio-chatgpt.py
index 56a3ead..47884e3 100644
--- a/audio-chatgpt.py
+++ b/audio-chatgpt.py
@@ -38,7 +38,7 @@ from audio_to_text.inference_waveform import AudioCapModel
import whisper
from inference.svs.ds_e2e import DiffSingerE2EInfer
from inference.tts.GenerSpeech import GenerSpeechInfer
-from inference.tts.SyntaSpeech import TTSInference
+from inference.tts.PortaSpeech import TTSInference
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
import scipy.io.wavfile as wavfile
@@ -282,7 +282,7 @@ class T2S:
print("Initializing DiffSinger to %s" % device)
self.device = device
self.exp_name = 'checkpoints/0831_opencpop_ds1000'
- self.config= 'NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml'
+ self.config= 'NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml'
self.set_model_hparams()
self.pipe = DiffSingerE2EInfer(self.hp, device)
self.default_inp = {