This commit is contained in:
PeppaPiggeee
2023-04-01 14:44:30 +08:00
parent c7e6dbe0f6
commit 7a315a8492
43 changed files with 53 additions and 277 deletions

View File

@@ -1,4 +1,4 @@
task_cls: usr.task.DiffFsTask
task_cls: tasks.svs.task.DiffFsTask
pitch_type: frame
timesteps: 100
dilation_cycle_length: 1

View File

@@ -23,7 +23,7 @@ spec_max: [ -0.5982, -0.0778, 0.1205, 0.2747, 0.4657, 0.5123, 0.5684, 0.70
0.0086, -0.0698, 0.1385, 0.0941, 0.1864, 0.1225, 0.2176, 0.2566,
0.1670, 0.1007, 0.1444, 0.0888, 0.1998, 0.2414, 0.2932, 0.3047 ]
task_cls: usr.diffspeech_task.DiffSpeechTask
task_cls: tasks.svs.diffspeech_task.DiffSpeechTask
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0414_hifi_lj_1
num_valid_plots: 10

View File

@@ -1,6 +1,6 @@
base_config:
- configs/singing/fs2.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
audio_sample_rate: 24000
hop_size: 128 # Hop size.
@@ -42,8 +42,8 @@ test_prefixes: [
'2100',
]
task_cls: usr.diffsinger_task.AuxDecoderMIDITask
#vocoder: usr.singingvocoder.highgan.HighGAN
task_cls: tasks.svs.diffsinger_task.AuxDecoderMIDITask
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128

View File

@@ -1,6 +1,6 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -21,7 +21,7 @@ pe_ckpt: ''
fs2_ckpt: 'checkpoints/0302_opencpop_fs_midi/model_ckpt_steps_160000.ckpt' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 60
max_tokens: 36000

View File

@@ -1,6 +1,6 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
timesteps: 1000
K_step: 1000

View File

@@ -1,6 +1,6 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
# for diffusion schedule
timesteps: 1000

View File

@@ -1,6 +1,6 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/opencs/opencpop_statis.yaml
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/opencs/opencpop_statis.yaml
binarizer_cls: data_gen.singing.binarize.OpencpopBinarizer
binary_data_dir: 'data/binary/opencpop-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 36000

View File

@@ -1,6 +1,6 @@
base_config:
- usr/configs/popcs_ds_beta6.yaml
- usr/configs/midi/cascade/popcs/popcs_statis.yaml
- egs/egs_bases/svs/popcs_ds_beta6.yaml
- egs/egs_bases/svs/midi/cascade/popcs/popcs_statis.yaml
binarizer_cls: data_gen.singing.binarize.MidiSingingBinarizer
binary_data_dir: 'data/binary/popcs-midi-dp'
@@ -17,7 +17,7 @@ dur_predictor_layers: 5 # *
fs2_ckpt: '' #
#num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerMIDITask
task_cls: tasks.svs.diffsinger_task.DiffSingerMIDITask
K_step: 100
max_tokens: 40000

View File

@@ -48,8 +48,8 @@ spec_max: [ 0.2645, 0.0583, -0.2344, -0.0184, 0.1227, 0.1533, 0.1103, 0.121
-0.8770, -0.9520, -0.8749, -0.7297, -0.8374, -0.8667, -0.7157, -0.9035,
-0.9219, -0.8801, -0.9298, -0.9009, -0.9604, -1.0537, -1.0781, -1.3766]
task_cls: usr.diffsinger_task.DiffSingerTask
#vocoder: usr.singingvocoder.highgan.HighGAN
task_cls: tasks.svs.diffsinger_task.DiffSingerTask
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128

View File

@@ -3,7 +3,7 @@ base_config:
fs2_ckpt: checkpoints/popcs_fs2_pmf0_1230/model_ckpt_steps_160000.ckpt # to be infer
num_valid_plots: 0
task_cls: usr.diffsinger_task.DiffSingerOfflineTask
task_cls: tasks.svs.diffsinger_task.DiffSingerOfflineTask
# tmp:
#pe_enable: true

View File

@@ -26,7 +26,7 @@ test_prefixes: [
]
task_cls: tasks.tts.fs2.FastSpeech2Task
#vocoder: usr.singingvocoder.highgan.HighGAN
#vocoder: tasks.svs.singingvocoder.highgan.HighGAN
#vocoder_ckpt: checkpoints/h_2_model/checkpoint-530000steps.pkl
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/0109_hifigan_bigpopcs_hop128

View File

@@ -2,8 +2,8 @@ import torch
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from usr.diffsinger_task import DIFF_DECODERS
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
from tasks.svs.diffsinger_task import DIFF_DECODERS
class DiffSingerCascadeInfer(BaseSVSInfer):
def build_model(self):
@@ -51,4 +51,4 @@ if __name__ == '__main__':
} # input like Opencpop dataset.
DiffSingerCascadeInfer.example_run(inp)
# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
# # CUDA_VISIBLE_DEVICES=1 python inference/svs/ds_cascade.py --config egs/egs_bases/svs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi

View File

@@ -4,8 +4,8 @@ import torch
from inference.svs.base_svs_infer import BaseSVSInfer
from utils import load_ckpt
from utils.hparams import hparams
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from usr.diffsinger_task import DIFF_DECODERS
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
from tasks.svs.diffsinger_task import DIFF_DECODERS
from modules.fastspeech.pe import PitchExtractor
import utils
@@ -64,4 +64,4 @@ if __name__ == '__main__':
DiffSingerE2EInfer.example_run(inp)
# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel
# CUDA_VISIBLE_DEVICES=3 python inference/svs/ds_e2e.py --config egs/egs_bases/svs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel

View File

@@ -1,27 +0,0 @@
title: 'DiffSinger'
description: |
This model is trained on 5 hours single female singing voice samples of Opencpop dataset. (该模型在开源数据集Opencpop的5小时单人歌声上训练。)
Please assign pitch and duration values to each Chinese character. The corresponding pitch and duration value of each character should be separated by a | separator. It is necessary to ensure that the note window separated by the separator is consistent with the number of Chinese characters (AP or SP is also viewed as a Chinese character). (请给每个汉字分配音高和时值, 每个字对应的音高和时值需要用|分隔符隔开。需要保证分隔符分割出来的音符窗口与汉字个数(AP或SP也算一个汉字)一致。)
You can click one of the examples to load them. (你可以点击下方示例,加载示例曲谱。)
Note: This space is running on CPU. (该Demo是在Huggingface提供的CPU上运行的, 其推理速度在本地会更快一些。)
article: |
Link to <a href='https://github.com/MoonInTheRiver/DiffSinger' style='color:blue;' target='_blank\'>Github REPO</a>
example_inputs:
- |-
你 说 你 不 SP 懂 为 何 在 这 时 牵 手 AP<sep>D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | D#4/Eb4 | rest | D#4/Eb4 | D4 | D4 | D4 | D#4/Eb4 | F4 | D#4/Eb4 | D4 | rest<sep>0.113740 | 0.329060 | 0.287950 | 0.133480 | 0.150900 | 0.484730 | 0.242010 | 0.180820 | 0.343570 | 0.152050 | 0.266720 | 0.280310 | 0.633300 | 0.444590
- |-
小酒窝长睫毛AP是你最美的记号<sep>C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4<sep>0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340
- |-
我真的SP爱你SP句句不轻易<sep>D4 | A4 | F#4 | rest | A4 | D4 | rest | B4 | A4 F#4 | F#4 | A4 | A4<sep>0.8 | 0.4 | 0.967 | 0.3 | 0.4 | 0.967 | 0.4 | 0.8 | 0.4 0.4 | 0.25 | 0.967 | 0.9
- |-
好冷啊 AP 我在东北玩泥巴<sep>F4 | F4 | D4 | rest | D4 | D4 | C4 | C4 | B3 | C4 | D4<sep>0.5 | 0.3 | 0.3 | 0.3 | 0.2 | 0.2 | 0.2 | 0.2 | 0.25 | 0.25 | 0.4
#inference_cls: inference.svs.ds_cascade.DiffSingerCascadeInfer
#exp_name: 0303_opencpop_ds58_midi
inference_cls: inference.svs.ds_e2e.DiffSingerE2EInfer
exp_name: 0831_opencpop_ds1000

View File

@@ -1,91 +0,0 @@
import importlib
import re
import gradio as gr
import yaml
from gradio.inputs import Textbox
from inference.svs.base_svs_infer import BaseSVSInfer
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
import numpy as np
class GradioInfer:
    """Gradio front end that drives a singing-voice inference class.

    The constructor arguments are supplied as keyword arguments by the
    entry-point script, which expands the loaded settings mapping into them.
    """

    def __init__(self, exp_name, inference_cls, title, description, article, example_inputs):
        """Store the demo metadata and resolve ``inference_cls`` to a class.

        :param exp_name: experiment name forwarded to ``set_hparams`` in ``run``.
        :param inference_cls: dotted path of the form ``package.module.ClassName``.
        :param title: forwarded to ``gr.Interface`` as ``title``.
        :param description: forwarded to ``gr.Interface`` as ``description``.
        :param article: forwarded to ``gr.Interface`` as ``article``.
        :param example_inputs: list of strings, each holding text, notes and
            durations joined by ``<sep>`` (they are split in ``run``).
        """
        self.exp_name = exp_name
        self.title = title
        self.description = description
        self.article = article
        self.example_inputs = example_inputs
        # Everything before the last dot is the module, the rest is the class.
        pkg, _, cls_name = inference_cls.rpartition(".")
        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)

    def greet(self, text, notes, notes_duration):
        """Synthesize audio for the given lyrics, notes and note durations.

        The three inputs are split on the same punctuation set and accumulated
        segment by segment; inference runs once the accumulated text reaches
        400 characters or the last segment has been consumed.

        :param text: lyrics.
        :param notes: note string; must split into as many segments as
            ``text``, otherwise indexing raises ``IndexError``.
        :param notes_duration: duration string; same segment-count requirement.
        :return: ``(sample_rate, samples)`` where ``samples`` is an int16
            array; it is empty when no segment produced any audio.
        """
        PUNCS = '。?;:'
        splitter = re.compile(rf'([{PUNCS}])')
        sents = splitter.split(text.replace('\n', ','))
        sents_notes = splitter.split(notes.replace('\n', ','))
        sents_notes_dur = splitter.split(notes_duration.replace('\n', ','))

        # Pad so that every segment at an even index has a (possibly empty)
        # punctuation partner at the following odd index.
        if sents[-1] not in list(PUNCS):
            sents = sents + ['']
            sents_notes = sents_notes + ['']
            sents_notes_dur = sents_notes_dur + ['']

        audio_outs = []
        s, n, n_dur = "", "", ""
        for i in range(0, len(sents), 2):
            if len(sents[i]) > 0:
                s += sents[i] + sents[i + 1]
                n += sents_notes[i] + sents_notes[i + 1]
                n_dur += sents_notes_dur[i] + sents_notes_dur[i + 1]
            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
                # assumes infer_once returns a float numpy array in [-1, 1] -- TODO confirm
                audio_out = self.infer_ins.infer_once({
                    'text': s,
                    'notes': n,
                    'notes_duration': n_dur,
                })
                audio_out = audio_out * 32767
                audio_out = audio_out.astype(np.int16)
                audio_outs.append(audio_out)
                # 0.3 seconds of silence after every synthesized chunk.
                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
                # Reset all three accumulators together; leaving n_dur behind
                # would feed earlier chunks' durations into the next call.
                s, n, n_dur = "", "", ""

        if not audio_outs:
            # np.concatenate rejects an empty list, so return an empty clip.
            return hp['audio_sample_rate'], np.zeros(0, dtype=np.int16)
        audio_outs = np.concatenate(audio_outs)
        return hp['audio_sample_rate'], audio_outs

    def run(self):
        """Load hparams, build the inference instance and launch the UI."""
        set_hparams(exp_name=self.exp_name, print_hparams=False)
        infer_cls = self.inference_cls
        self.infer_ins: BaseSVSInfer = infer_cls(hp)
        example_inputs = self.example_inputs
        # Each example arrives as "text<sep>notes<sep>durations"; replace it
        # in place with the three-element list used by the three textboxes.
        for i, example in enumerate(example_inputs):
            text, notes, notes_dur = example.split('<sep>')
            example_inputs[i] = [text, notes, notes_dur]

        iface = gr.Interface(
            fn=self.greet,
            inputs=[
                Textbox(lines=2, placeholder=None, default=example_inputs[0][0], label="input text"),
                Textbox(lines=2, placeholder=None, default=example_inputs[0][1], label="input note"),
                Textbox(lines=2, placeholder=None, default=example_inputs[0][2], label="input duration"),
            ],
            outputs="audio",
            allow_flagging="never",
            title=self.title,
            description=self.description,
            article=self.article,
            examples=example_inputs,
            enable_queue=True,
        )
        iface.launch(share=True)  # cache_examples=True is intentionally left off
if __name__ == '__main__':
    # Read the demo settings through a context manager so the handle is closed,
    # and decode as UTF-8 because the settings contain non-ASCII text.
    with open('inference/svs/gradio/gradio_settings.yaml', encoding='utf-8') as settings_file:
        gradio_config = yaml.safe_load(settings_file)
    g = GradioInfer(**gradio_config)
    g.run()
# python inference/svs/gradio/infer.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
# python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name 0303_opencpop_ds58_midi
# CUDA_VISIBLE_DEVICES=3 python inference/svs/gradio/infer.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name 0228_opencpop_ds100_rel

View File

@@ -1,12 +1,15 @@
import torch
import os
import importlib
from inference.tts.base_tts_infer import BaseTTSInfer
from utils.ckpt_utils import load_ckpt, get_last_checkpoint
from modules.GenerSpeech.model.generspeech import GenerSpeech
import os
from data_gen.tts.emotion import inference as EmotionEncoder
from data_gen.tts.emotion.inference import embed_utterance as Embed_utterance
from data_gen.tts.emotion.inference import preprocess_wav
from data_gen.tts.data_gen_utils import is_sil_phoneme
from resemblyzer import VoiceEncoder
from utils import audio
class GenerSpeechInfer(BaseTTSInfer):
def build_model(self):
model = GenerSpeech(self.ph_encoder)

View File

@@ -1,6 +1,3 @@
from data_gen.tts.data_gen_utils import is_sil_phoneme
from resemblyzer import VoiceEncoder
from data_gen.tts.data_gen_utils import build_phone_encoder, build_word_encoder
from tasks.tts.dataset_utils import FastSpeechWordDataset
from tasks.tts.tts_utils import load_data_preprocessor
from vocoders.hifigan import HifiGanGenerator

View File

@@ -1,13 +0,0 @@
title: 'Rongjiehuang/GenerSpeech'
description: |
Gradio demo for Rongjiehuang/GenerSpeech. To use it, simply add your audio, or click one of the examples to load them.
article: |
Link to <a href='https://github.com/Rongjiehuang/GenerSpeech' style='color:blue;' target='_blank\'>Github REPO</a>
example_inputs:
- |-
the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.
- |-
produced the block books, which were the immediate predecessors of the true printed book,
inference_cls: inference.GenerSpeech.GenerSpeechInfer
exp_name: GenerSpeech
config: modules/GenerSpeech/config/prodiff_teacher.yaml

View File

@@ -1,72 +0,0 @@
import importlib
import re
import gradio as gr
import yaml
from gradio.inputs import Textbox, Audio
from inference.base_tts_infer import BaseTTSInfer
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
import numpy as np
from data_gen.tts.data_gen_utils import is_sil_phoneme, PUNCS
class GradioInfer:
    """Gradio front end that drives a text-to-speech inference class.

    The constructor arguments are supplied as keyword arguments by the
    entry-point script, which expands the loaded settings mapping into them.
    """

    def __init__(self, exp_name, config, inference_cls, title, description, article, example_inputs):
        """Store the demo metadata and resolve ``inference_cls`` to a class.

        :param exp_name: experiment name forwarded to ``set_hparams`` in ``run``.
        :param config: config path forwarded to ``set_hparams`` in ``run``.
        :param inference_cls: dotted path of the form ``package.module.ClassName``.
        :param title: forwarded to ``gr.Interface`` as ``title``.
        :param description: forwarded to ``gr.Interface`` as ``description``.
        :param article: forwarded to ``gr.Interface`` as ``article``.
        :param example_inputs: example texts shown in the interface.
        """
        self.exp_name = exp_name
        self.config = config
        self.title = title
        self.description = description
        self.article = article
        self.example_inputs = example_inputs
        # Everything before the last dot is the module, the rest is the class.
        pkg, _, cls_name = inference_cls.rpartition(".")
        self.inference_cls = getattr(importlib.import_module(pkg), cls_name)

    def greet(self, text, audio):
        """Synthesize speech for ``text`` using ``audio`` as the reference.

        The text is split on ``PUNCS`` and accumulated sentence by sentence;
        inference runs once the accumulated text reaches 400 characters or the
        last sentence has been consumed.

        :param text: input text.
        :param audio: reference audio, passed through unchanged as
            ``ref_audio``.
        :return: ``(sample_rate, samples)`` where ``samples`` is an int16
            array; it is empty when the text contains nothing to synthesize.
        """
        sents = re.split(rf'([{PUNCS}])', text.replace('\n', ','))
        # Pad so that every sentence at an even index has a punctuation
        # partner at the following odd index.
        if sents[-1] not in list(PUNCS):
            sents = sents + ['.']

        audio_outs = []
        s = ""
        for i in range(0, len(sents), 2):
            if len(sents[i]) > 0:
                s += sents[i] + sents[i + 1]
            if len(s) >= 400 or (i >= len(sents) - 2 and len(s) > 0):
                # assumes infer_once returns a float numpy array in [-1, 1] -- TODO confirm
                audio_out = self.infer_ins.infer_once({
                    'text': s,
                    'ref_audio': audio
                })
                audio_out = audio_out * 32767
                audio_out = audio_out.astype(np.int16)
                audio_outs.append(audio_out)
                # 0.3 seconds of silence after every synthesized chunk.
                audio_outs.append(np.zeros(int(hp['audio_sample_rate'] * 0.3)).astype(np.int16))
                s = ""

        if not audio_outs:
            # np.concatenate rejects an empty list, so return an empty clip.
            return hp['audio_sample_rate'], np.zeros(0, dtype=np.int16)
        audio_outs = np.concatenate(audio_outs)
        return hp['audio_sample_rate'], audio_outs

    def run(self):
        """Load hparams, build the inference instance and launch the UI."""
        set_hparams(exp_name=self.exp_name, config=self.config)
        infer_cls = self.inference_cls
        self.infer_ins: BaseTTSInfer = infer_cls(hp)
        example_inputs = self.example_inputs
        iface = gr.Interface(
            fn=self.greet,
            inputs=[
                Textbox(lines=10, placeholder=None, default=example_inputs[0], label="input text"),
                Audio(label="reference audio"),
            ],
            outputs="audio",
            allow_flagging="never",
            title=self.title,
            description=self.description,
            article=self.article,
            examples=example_inputs,
            enable_queue=True,
        )
        iface.launch()
if __name__ == '__main__':
    # Read the demo settings through a context manager so the handle is closed.
    with open('inference/gradio/gradio_settings.yaml', encoding='utf-8') as settings_file:
        gradio_config = yaml.safe_load(settings_file)
    g = GradioInfer(**gradio_config)
    g.run()

View File

@@ -76,26 +76,6 @@ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False)
pass
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
class LayerNorm_(torch.nn.LayerNorm):
    """Layer normalization module.

    :param int nout: output dim size
    :param int dim: dimension to be normalized
    :param float eps: value forwarded to ``torch.nn.LayerNorm`` as ``eps``
    """

    def __init__(self, nout, dim=-1, eps=1e-5):
        """Construct a LayerNorm object."""
        super().__init__(nout, eps=eps)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization.

        :param torch.Tensor x: input tensor
        :return: layer normalized tensor
        :rtype torch.Tensor
        """
        if self.dim == -1:
            return super().forward(x)
        # The parent normalizes the last axis, so swap the requested axis to
        # the end, normalize, and swap it back. Using self.dim (rather than a
        # fixed axis 1) keeps dim=1 unchanged and makes other axes work too.
        return super().forward(x.transpose(self.dim, -1)).transpose(self.dim, -1)
def Linear(in_features, out_features, bias=True):
m = nn.Linear(in_features, out_features, bias)
nn.init.xavier_uniform_(m.weight)

View File

@@ -3,7 +3,8 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from modules.commons.common_layers import LayerNorm_, Embedding
from modules.commons.common_layers import Embedding
from modules.fastspeech.tts_modules import LayerNorm
class LambdaLayer(nn.Module):
@@ -35,7 +36,7 @@ class ResidualBlock(nn.Module):
elif norm_type == 'gn':
norm_builder = lambda: nn.GroupNorm(8, channels)
elif norm_type == 'ln':
norm_builder = lambda: LayerNorm_(channels, eps=ln_eps)
norm_builder = lambda: LayerNorm(channels, dim=1, eps=ln_eps)
else:
norm_builder = lambda: nn.Identity()
@@ -89,7 +90,7 @@ class ConvBlocks(nn.Module):
elif norm_type == 'gn':
norm = nn.GroupNorm(8, hidden_size)
elif norm_type == 'ln':
norm = LayerNorm_(hidden_size, eps=ln_eps)
norm = LayerNorm(hidden_size, dim=1, eps=ln_eps)
self.last_norm = norm
self.post_net1 = nn.Conv1d(hidden_size, out_dims, kernel_size=post_net_kernel,
padding=post_net_kernel // 2)

View File

@@ -3,8 +3,8 @@ import torch
from torch import nn
import torch.nn.functional as F
from modules.commons.conv import TextConvEncoder, ConvBlocks
from modules.commons.common_layers import Embedding, LayerNorm_
from modules.fastspeech.tts_modules import PitchPredictor, LengthRegulator
from modules.commons.common_layers import Embedding
from modules.fastspeech.tts_modules import LayerNorm, PitchPredictor, LengthRegulator
from modules.commons.rel_transformer import RelTransformerEncoder, BERTRelTransformerEncoder
from modules.commons.align_ops import clip_mel2token_to_multiple, expand_states
from utils.pitch_utils import denorm_f0, f0_to_coarse
@@ -34,7 +34,7 @@ class DurationPredictor(torch.nn.Module):
self.conv += [torch.nn.Sequential(
torch.nn.Conv1d(in_chans, n_chans, kernel_size, stride=1, padding=kernel_size // 2),
torch.nn.ReLU(),
LayerNorm_(n_chans, dim=1),
LayerNorm(n_chans, dim=1),
torch.nn.Dropout(dropout_rate)
)]
self.linear = nn.Sequential(torch.nn.Linear(n_chans, 1), nn.Softplus())

View File

@@ -2,16 +2,16 @@ import torch
import utils
from utils.hparams import hparams
from .diff.net import DiffNet
from .diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
from .diffspeech_task import DiffSpeechTask
from modules.diff.net import DiffNet
from modules.diff.shallow_diffusion_tts import GaussianDiffusion, OfflineGaussianDiffusion
from tasks.svs.diffspeech_task import DiffSpeechTask
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
from modules.fastspeech.pe import PitchExtractor
from modules.fastspeech.fs2 import FastSpeech2
from modules.diffsinger_midi.fs2 import FastSpeech2MIDI
from modules.fastspeech.tts_modules import mel2ph_to_dur
from usr.diff.candidate_decoder import FFT
from modules.diff.candidate_decoder import FFT
from utils.pitch_utils import denorm_f0
from tasks.tts.fs2_utils import FastSpeechDataset
from tasks.tts.fs2 import FastSpeech2Task

View File

@@ -2,9 +2,9 @@ import torch
import utils
from utils.hparams import hparams
from .diff.net import DiffNet
from .diff.shallow_diffusion_tts import GaussianDiffusion
from .task import DiffFsTask
from modules.diff.net import DiffNet
from modules.diff.shallow_diffusion_tts import GaussianDiffusion
from tasks.svs.task import DiffFsTask
from vocoders.base_vocoder import get_vocoder_cls, BaseVocoder
from utils.pitch_utils import denorm_f0
from tasks.tts.fs2_utils import FastSpeechDataset

View File

@@ -1,8 +1,8 @@
import torch
import utils
from .diff.diffusion import GaussianDiffusion
from .diff.net import DiffNet
from modules.diff.diffusion import GaussianDiffusion
from modules.diff.net import DiffNet
from tasks.tts.fs2 import FastSpeech2Task
from utils.hparams import hparams

View File

@@ -1,7 +1,5 @@
import matplotlib
matplotlib.use('Agg')
from utils import audio
import matplotlib.pyplot as plt
from data_gen.tts.data_gen_utils import get_pitch

View File

@@ -38,7 +38,7 @@ from audio_to_text.inference_waveform import AudioCapModel
import whisper
from inference.svs.ds_e2e import DiffSingerE2EInfer
from inference.tts.GenerSpeech import GenerSpeechInfer
from inference.tts.SyntaSpeech import TTSInference
from inference.tts.PortaSpeech import TTSInference
from utils.hparams import set_hparams
from utils.hparams import hparams as hp
import scipy.io.wavfile as wavfile
@@ -282,7 +282,7 @@ class T2S:
print("Initializing DiffSinger to %s" % device)
self.device = device
self.exp_name = 'checkpoints/0831_opencpop_ds1000'
self.config= 'NeuralSeq/usr/configs/midi/e2e/opencpop/ds1000.yaml'
self.config= 'NeuralSeq/egs/egs_bases/svs/midi/e2e/opencpop/ds1000.yaml'
self.set_model_hparams()
self.pipe = DiffSingerE2EInfer(self.hp, device)
self.default_inp = {