update t2s

PeppaPiggeee
2023-03-25 21:45:49 +08:00
parent 1eaf0ae017
commit 8de9ce6595
23 changed files with 11 additions and 28220 deletions

.gitignore

@@ -4,7 +4,7 @@
 .circleci/
 # Byte-compiled / optimized / DLL files
-__pycache__/
+*__pycache__/
 *.py[cod]
 *$py.class


@@ -17,7 +17,14 @@ Input Example : Generate the audio of this image<br />
 Output:<br />
 ![](i2a-2.png)<br />
 ## ASR
-First uploag your audio(.wav)<br />
+First upload your audio(.wav)<br />
 Input Example : Generate the text of this audio<br />
 Output:<br />
 ![](asr.png)<br />
+## Style Transfer Text-To-Speech
+First upload your audio(.wav)<br />
+Input Example : Speak using the voice of this audio. The text is "here we go".<br />
+Output:<br />
+![](style_transfer_tts.png)<br />

Binary file not shown (image, 785 KiB).


@@ -517,8 +517,8 @@ class ConversationBot:
             Tool(
                 name="Generate speech with unseen style derived from a reference audio acoustic reference from user input text and save it to a file", func= self.tts_ood.inference,
                 description="useful for when you want to generate high-quality speech samples with unseen styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
-                            "Like: generate a speech with unseen style derived from this custom voice. The text is xxx."
-                            "Or speak using the voice of this audio. The text is xxx."
+                            "Like: Generate a speech with unseen style derived from this custom voice. The text is xxx."
+                            "Or Speak using the voice of this audio. The text is xxx."
                             "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
             Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
                  description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
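The descriptions above tell the language-model agent to pass this tool a single comma-separated string holding the reference audio path and the input text. A minimal sketch of how such an input might be split before calling an inference function; the helper below is hypothetical and not the repository's actual parsing code:

```python
def split_audio_and_text(tool_input):
    """Split a 'reference_audio_path, input text' string at the first comma.

    Hypothetical helper: the repository's tts_ood.inference may parse its
    input differently.
    """
    audio_path, _, text = tool_input.partition(",")
    return audio_path.strip(), text.strip()


if __name__ == "__main__":
    path, text = split_audio_and_text("audio/ref_voice.wav, here we go")
    print(path)  # audio/ref_voice.wav
    print(text)  # here we go
```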


@@ -1,77 +0,0 @@
! !
, ,
. .
; ;
<BOS> <BOS>
<EOS> <EOS>
? ?
AA0 AA0
AA1 AA1
AA2 AA2
AE0 AE0
AE1 AE1
AE2 AE2
AH0 AH0
AH1 AH1
AH2 AH2
AO0 AO0
AO1 AO1
AO2 AO2
AW0 AW0
AW1 AW1
AW2 AW2
AY0 AY0
AY1 AY1
AY2 AY2
B B
CH CH
D D
DH DH
EH0 EH0
EH1 EH1
EH2 EH2
ER0 ER0
ER1 ER1
ER2 ER2
EY0 EY0
EY1 EY1
EY2 EY2
F F
G G
HH HH
IH0 IH0
IH1 IH1
IH2 IH2
IY0 IY0
IY1 IY1
IY2 IY2
JH JH
K K
L L
M M
N N
NG NG
OW0 OW0
OW1 OW1
OW2 OW2
OY0 OY0
OY1 OY1
OY2 OY2
P P
R R
S S
SH SH
T T
TH TH
UH0 UH0
UH1 UH1
UH2 UH2
UW0 UW0
UW1 UW1
UW2 UW2
V V
W W
Y Y
Z Z
ZH ZH
| |

File diff suppressed because it is too large.


@@ -1 +0,0 @@
["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
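The deleted phone_set.json above lists the 77 phoneme symbols used by the text frontend. For orientation, a minimal sketch of how such a symbol list is typically turned into an integer encoder; the helper is hypothetical, and the repository's build_phone_encoder may reserve extra ids:

```python
import json


def load_phone_encoder(phone_set_path):
    """Build symbol<->id mappings from a phone_set.json-style list.

    Hypothetical sketch: the repository's build_phone_encoder may reserve
    special ids (e.g. padding) and behave differently.
    """
    with open(phone_set_path) as f:
        symbols = json.load(f)  # e.g. ["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", ...]
    ph2id = {ph: i for i, ph in enumerate(symbols)}
    id2ph = {i: ph for ph, i in ph2id.items()}

    def encode(ph_str):
        return [ph2id[p] for p in ph_str.split(' ')]

    return encode, id2ph


# Example (assuming a local phone_set.json):
# encode, id2ph = load_phone_encoder('phone_set.json')
# print(encode('HH AH0 L OW1'))
```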


@@ -1,398 +0,0 @@
import os
import random
from copy import deepcopy
import pandas as pd
import logging
from tqdm import tqdm
import json
import glob
import re
from resemblyzer import VoiceEncoder
import traceback
import numpy as np
import pretty_midi
import librosa
from scipy.interpolate import interp1d
import torch
from textgrid import TextGrid

from utils.hparams import hparams
from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
from utils.pitch_utils import f0_to_coarse
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.binarizer_zh import ZhBinarizer
from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
from vocoders.base_vocoder import VOCODERS


class SingingBinarizer(BaseBinarizer):
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2f0fn = {}
        self.item2tgfn = {}
        self.item2spk = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            wav_suffix = '_wf0.wav'
            txt_suffix = '.txt'
            ph_suffix = '_ph.txt'
            tg_suffix = '.TextGrid'
            all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')
            for piece_path in all_wav_pieces:
                item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
                self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
                self.item2wavfn[item_name] = piece_path
                self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @property
    def train_item_names(self):
        return self._train_item_names

    @property
    def valid_item_names(self):
        return self._test_item_names

    @property
    def test_item_names(self):
        return self._test_item_names

    def process(self):
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        json.dump(self.spk_map, open(spk_map_fn, 'w'))
        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def _phone_encoder(self):
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'))
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    # @staticmethod
    # def get_pitch(wav_fn, spec, res):
    #     wav_suffix = '_wf0.wav'
    #     f0_suffix = '_f0.npy'
    #     f0fn = wav_fn.replace(wav_suffix, f0_suffix)
    #     pitch_info = np.load(f0fn)
    #     f0 = [x[1] for x in pitch_info]
    #     spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
    #     f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    #     f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
    #     # f0_x_coor = np.arange(0, 1, 1 / len(f0))
    #     # f0_x_coor[-1] = 1
    #     # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
    #     if sum(f0) == 0:
    #         raise BinarizationError("Empty f0")
    #     assert len(f0) == len(spec), (len(f0), len(spec))
    #     pitch_coarse = f0_to_coarse(f0)
    #
    #     # vis f0
    #     # import matplotlib.pyplot as plt
    #     # from textgrid import TextGrid
    #     # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
    #     # fig = plt.figure(figsize=(12, 6))
    #     # plt.pcolor(spec.T, vmin=-5, vmax=0)
    #     # ax = plt.gca()
    #     # ax2 = ax.twinx()
    #     # ax2.plot(f0, color='red')
    #     # ax2.set_ylim(0, 800)
    #     # itvs = TextGrid.fromFile(tg_fn)[0]
    #     # for itv in itvs:
    #     #     x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
    #     #     plt.vlines(x=x, ymin=0, ymax=80, color='black')
    #     #     plt.text(x=x, y=20, s=itv.mark, color='black')
    #     # plt.savefig('tmp/20211229_singing_plots_test.png')
    #
    #     res['f0'] = f0
    #     res['pitch'] = pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                # cls.get_pitch(wav_fn, mel, res)
                cls.get_pitch(wav, mel, res)
            if binarization_args['with_txt']:
                try:
                    # print(ph)
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class MidiSingingBinarizer(SingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json')))  # [list of dict]
            for song_item in meta_midi:
                item_name = raw_item_name = song_item['item_name']
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2wavfn[item_name] = song_item['wav_fn']
                self.item2txt[item_name] = song_item['txt']
                self.item2ph[item_name] = ' '.join(song_item['phs'])
                self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
                self.item2ph_durs[item_name] = song_item['ph_dur']
                self.item2midi[item_name] = song_item['notes']
                self.item2midi_dur[item_name] = song_item['notes_dur']
                self.item2is_slur[item_name] = song_item['is_slur']
                self.item2spk[item_name] = 'pop-cs'
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'f0'
        item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
        res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
            res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @staticmethod
    def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
        mel2ph = np.zeros([mel.shape[0]], int)
        startTime = 0
        for i_ph in range(len(ph_durs)):
            start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
            end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
            mel2ph[start_frame:end_frame] = i_ph + 1
            startTime = startTime + ph_durs[i_ph]
        # print('ph durs: ', ph_durs)
        # print('mel2ph: ', mel2ph, len(mel2ph))
        res['mel2ph'] = mel2ph
        # res['dur'] = None

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
    pass


class OpencpopBinarizer(MidiSingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        raw_data_dir = hparams['raw_data_dir']
        # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json')))  # [list of dict]
        utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()
        for utterance_label in utterance_labels:
            song_info = utterance_label.split('|')
            item_name = raw_item_name = song_info[0]
            self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
            self.item2txt[item_name] = song_info[1]
            self.item2ph[item_name] = song_info[2]
            # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
            self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
            self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]
            self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                                         for x in song_info[3].split(" ")]
            self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
            self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
            self.item2spk[item_name] = 'opencpop'
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'text_f0_align'
        item_name = os.path.splitext(os.path.basename(wav_fn))[0]
        res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
        # gt f0.
        # f0 = None
        # f0_suffix = '_f0.npy'
        # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
        # pitch_info = np.load(f0fn)
        # f0 = [x[1] for x in pitch_info]
        # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
        #
        # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
        # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
        # if sum(f0) == 0:
        #     raise BinarizationError("Empty **gt** f0")
        #
        # pitch_coarse = f0_to_coarse(f0)
        # res['f0'] = f0
        # res['pitch'] = pitch_coarse
        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


if __name__ == "__main__":
    SingingBinarizer().process()
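The get_align methods above convert per-phoneme durations (in seconds) into a frame-level mel2ph index with start_frame = int(t * sample_rate / hop_size + 0.5). A minimal standalone sketch of that arithmetic; the sample rate and hop size below are illustrative assumptions, not values taken from this commit's configs:

```python
import numpy as np


def durations_to_mel2ph(ph_durs, num_frames, sample_rate=24000, hop_size=128):
    """Map each mel frame to a 1-based phoneme index from phoneme durations (seconds).

    Illustrative re-implementation of the alignment arithmetic in get_align;
    sample_rate and hop_size are assumed values, not the repository's settings.
    """
    mel2ph = np.zeros(num_frames, dtype=int)
    start_time = 0.0
    for i_ph, dur in enumerate(ph_durs):
        start_frame = int(start_time * sample_rate / hop_size + 0.5)
        end_frame = int((start_time + dur) * sample_rate / hop_size + 0.5)
        mel2ph[start_frame:end_frame] = i_ph + 1  # 0 marks frames with no phoneme
        start_time += dur
    return mel2ph


print(durations_to_mel2ph([0.10, 0.25], num_frames=70))
```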


@@ -1,20 +0,0 @@
import os

os.environ["OMP_NUM_THREADS"] = "1"

import importlib
from utils.hparams import set_hparams, hparams


def binarize():
    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg = ".".join(binarizer_cls.split(".")[:-1])
    cls_name = binarizer_cls.split(".")[-1]
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()


if __name__ == '__main__':
    set_hparams()
    binarize()


@@ -1,17 +0,0 @@
from utils.hparams import set_hparams, hparams
import importlib


def preprocess():
    assert hparams['preprocess_cls'] != ''
    pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
    cls_name = hparams["preprocess_cls"].split(".")[-1]
    process_cls = getattr(importlib.import_module(pkg), cls_name)
    process_cls().process()


if __name__ == '__main__':
    set_hparams()
    preprocess()


@@ -1,15 +0,0 @@
import subprocess
from utils.hparams import hparams, set_hparams
import os


def train_mfa_align():
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    print(f"| Run MFA for {CORPUS}.")
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
    subprocess.check_call(f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash data_gen/tts/scripts/run_mfa_train_align.sh', shell=True)


if __name__ == '__main__':
    set_hparams(print_hparams=False)
    train_mfa_align()


@@ -1,111 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
## DiffSinger (MIDI SVS | A version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
The pipeline below is designed for the Opencpop dataset:
### 1. Preparation
#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
# `data/binary/opencpop-midi-dp` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
#### Exp Name Preparation
```bash
export MY_FS_EXP_NAME=0302_opencpop_fs_midi
export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
```
```
.
|--data
|--raw
|--opencpop
|--segments
|--transcriptions.txt
|--wavs
|--checkpoints
|--MY_FS_EXP_NAME (optional)
|--MY_DS_EXP_NAME (optional)
|--0109_hifigan_bigpopcs_hop128
|--model_ckpt_steps_1512000.ckpt
|--config.yaml
```
### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.
### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```
We also provide:
- the pre-trained model of DiffSinger;
- the pre-trained model of FFT-Singer;
They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
Remember to put the pre-trained models in `checkpoints` directory.
### 4. Inference from raw inputs
```sh
python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
'input_type': 'word'
} # user input: Chinese characters
or,
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
'input_type': 'phoneme'
} # input like Opencpop dataset.
```
### 5. Some issues.
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.
b) In this version of the code, we use the melody frontend ([lyric + MIDI]->[F0+ph_dur]) to predict the F0 contour and phoneme durations.
c) Generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).


@@ -1,107 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
**By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**
In short, the dynamics of the F0 curve are now left to the generative model to capture, instead of being constrained with an MSE loss on log-domain F0 as before.
## DiffSinger (MIDI SVS | B version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
The pipeline below is designed for the Opencpop dataset:
### 1. Preparation
#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
# `data/binary/opencpop-midi-dp` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Also, please unzip the pre-trained vocoder and [this companion checkpoint for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
#### Exp Name Preparation
```bash
export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
```
```
.
|--data
|--raw
|--opencpop
|--segments
|--transcriptions.txt
|--wavs
|--checkpoints
|--MY_DS_EXP_NAME (optional)
|--0109_hifigan_bigpopcs_hop128 (vocoder)
|--model_ckpt_steps_1512000.ckpt
|--config.yaml
```
### 2. Training Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```
### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```
We also provide:
- the pre-trained model of DiffSinger;
They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
Remember to put the pre-trained models in `checkpoints` directory.
### 4. Inference from raw inputs
```sh
python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
'input_type': 'word'
} # user input: Chinese characters
or,
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
'input_type': 'phoneme'
} # input like Opencpop dataset.
```
### 5. Some issues.
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.
b) In this version of the code, we use the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme durations; the F0 curve is predicted implicitly together with the mel-spectrogram.
c) Example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).


@@ -1,63 +0,0 @@
## DiffSinger (SVS version)
### 0. Data Acquirement
- See in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch by running:
```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
Remember to put the pre-trained models in `checkpoints` directory.
*Note that:*
- *The original PWG vocoder used in the paper has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.*
- *We assume the ground-truth F0 is given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.


@@ -1,76 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
## DiffSinger (SVS)
### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
In PART1, we only focus on spectrum modeling (acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information following these papers [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART2.
Thus, the pipeline of this part can be summarized as:
```
[lyrics] -> [linguistic representation] (Frontend)
[linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
```
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
Click here for detailed instructions: [link](README-SVS-popcs.md).
### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan.20, 2022** (after we published our paper).
Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend.
#### 2.A
Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
[linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```
Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).
#### 2.B
In 2.A, we find that predicting F0 explicitly in the melody frontend leads to many bad cases of unvoiced/voiced (uv/v) prediction. We therefore abandon the explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.
Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
[linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] -> [predicted F0] (Pitch extractor)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```
Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).
### FAQ
Q1: Why do I need F0 in Vocoders?
A1: See vocoder parts in HiFiSinger, DiffSinger or SingGAN. This is a common practice now.
Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset?
A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon.
Q3: Why do I get "'HifiGAN' object has no attribute 'model'"?
A3: Please put the pretrained vocoders in your `checkpoints` directory.
Q4: How to check whether I use GT information or predicted information during inference from packed test set?
A4: Please see codes [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).
...


@@ -1,69 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
## DiffSpeech (TTS)
### 1. Preparation
#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
b) Download and unzip the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
c) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
# `data/binary/ljspeech` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip this file into `checkpoints` before training your acoustic model.
### 2. Training Example
First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
Remember to put the pre-trained models in `checkpoints` directory.
## Mel Visualization
Along the vertical axis, DiffSpeech: [0-80]; FastSpeech2: [80-160].
<table style="width:100%">
<tr>
<th>DiffSpeech vs. FastSpeech 2</th>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
</table>


@@ -1,212 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue)](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
| [English README](../README.md)
This repository contains the official PyTorch implementation of DiffSpeech (for text-to-speech) and DiffSinger (for singing voice synthesis) proposed in our AAAI-2022 [paper](https://arxiv.org/abs/2105.02446).
<table style="width:100%">
<tr>
<th>DiffSinger/DiffSpeech training stage</th>
<th>DiffSinger/DiffSpeech inference stage</th>
</tr>
<tr>
<td><img src="resources/model_a.png" alt="Training" height="300"></td>
<td><img src="resources/model_b.png" alt="Inference" height="300"></td>
</tr>
</table>
:tada: :tada: :tada: **Some important updates**:
- Mar.2, 2022: [MIDI-new-version](README-SVS-opencpop-e2e.md): a major update :sparkles:
- Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), code for the singing voice beautification task, is open-sourced :sparkles: :sparkles: :sparkles:.
- Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), an upgraded code framework that includes DiffSpeech and our NeurIPS-2021 work [PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq), is open-sourced! :sparkles: :sparkles: :sparkles:.
- Jan.29, 2022: Added support for the [MIDI-old-version](README-SVS-opencpop-cascade.md) singing voice synthesis system.
- Jan.13, 2022: Added support for singing voice synthesis and released the PopCS dataset.
- Dec.19, 2021: Added support for text-to-speech. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
:rocket: **News**:
- Feb.24, 2022: Our new work `NeuralSVB` was accepted by ACL-2022 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2202.13277). [Audio demos](https://neuralsvb.github.io).
- Dec.01, 2021: DiffSinger was accepted by AAAI-2022.
- Sep.29, 2021: Our new work `PortaSpeech: Portable and High-Quality Generative Text-to-Speech` was accepted by NeurIPS-2021 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2109.15166).
- May.06, 2021: We submitted DiffSinger to Arxiv [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446).
## Install dependencies
```sh
conda create -n your_env_name python=3.8
source activate your_env_name
pip install -r requirements_2080.txt (GPU 2080Ti, CUDA 10.2)
or pip install -r requirements_3090.txt (GPU 3090, CUDA 11.4)
```
## DiffSpeech (TTS version)
### 1. Preparation
#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
b) Download and extract [the alignments pre-computed with MFA](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
c) Run the following scripts to pack the dataset; the packed binary files are used for training and inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
# `data/binary/ljspeech` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.
### 2. Training Example
First, you need a pre-trained FastSpeech2 checkpoint. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch with the following command:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
Remember to put the pre-trained models in the `checkpoints` directory.
## DiffSinger (SVS version)
### 0. Data Acquirement
- See the [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
b) Run the following scripts to pack the dataset; the packed binary files are used for training and inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.
(Update: You can also move [a checkpoint with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)
This vocoder is trained on a relatively large dataset of about 70 hours of singing and can be viewed as a universal vocoder.
### 2. Training Example
First, you need a pre-trained FFT-Singer. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch with the following scripts:
```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
Remember to put the pre-trained models in the `checkpoints` directory.
*Note that:*
- *The PWG version vocoder used in our original paper has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.*
- *This paper assumes the ground-truth F0 is given for the experiments, as previous works [1][2][3] did; the focus is on spectrum modeling rather than F0 curve prediction. If you want to conduct experiments on MIDI data (predicting the F0 curve, explicitly or implicitly, from MIDI and lyrics), please see [MIDI-old-version](README-SVS-opencpop-cascade.md) or [MIDI-new-version](README-SVS-opencpop-e2e.md). Currently supported MIDI datasets: Opencpop.*
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
## Tensorboard
```sh
tensorboard --logdir_spec exp_name
```
<table style="width:100%">
<tr>
<td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
</tr>
</table>
## Mel Visualization
Along the vertical axis, DiffSpeech: [0-80]; FastSpeech2: [80-160].
<table style="width:100%">
<tr>
<th>DiffSpeech vs. FastSpeech 2</th>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
</table>
## Audio Demos
Audio samples can be found on our [demo page](https://diffsinger.github.io/).
We also put some test-set audio samples generated by DiffSpeech+HifiGAN (marked [P]) and GTmel+HifiGAN (marked [G]) in [resources/demos_1213](../resources/demos_1213).
(corresponding to the pre-trained checkpoint [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip))
---
:rocket: :rocket: :rocket: **Update:**
Newly generated singing samples are in [resources/demos_0112](../resources/demos_0112).
## Citation
If this repository is useful for your research and work, please cite the following paper:
@article{liu2021diffsinger,
  title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
  author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
  journal={arXiv preprint arXiv:2105.02446},
  volume={2},
  year={2021}}
## Acknowledgements
Our codes are based on the following repositories:
* [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
* [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
* [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
* [HifiGAN](https://github.com/jik876/hifi-gan)
* [espnet](https://github.com/espnet/espnet)
* [DiffWave](https://github.com/lmnt-com/diffwave)


@@ -1 +0,0 @@
libsndfile1


@@ -1,118 +0,0 @@
absl-py==0.11.0
alignment==1.0.10
altgraph==0.17
appdirs==1.4.4
async-timeout==3.0.1
audioread==2.1.9
backcall==0.2.0
blinker==1.4
brotlipy==0.7.0
cachetools==4.2.0
certifi==2020.12.5
cffi==1.14.4
chardet==4.0.0
click==7.1.2
cycler==0.10.0
Cython==0.29.21
cytoolz==0.11.0
decorator==4.4.2
Distance==0.1.3
einops==0.3.0
et-xmlfile==1.0.1
fsspec==0.8.4
future==0.18.2
g2p-en==2.1.0
g2pM==0.1.2.5
google-auth==1.24.0
google-auth-oauthlib==0.4.2
grpcio==1.34.0
h5py==3.1.0
horology==1.1.0
httplib2==0.18.1
idna==2.10
imageio==2.9.0
inflect==5.0.2
ipdb==0.13.4
ipython==7.19.0
ipython-genutils==0.2.0
jdcal==1.4.1
jedi==0.17.2
jieba==0.42.1
jiwer==2.2.0
joblib==1.0.0
kiwisolver==1.3.1
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.3
matplotlib==3.3.3
miditoolkit==0.1.7
mido==1.2.9
music21==5.7.2
networkx==2.5
nltk==3.5
numba==0.48.0
numpy==1.19.4
oauth2client==4.1.3
oauthlib==3.1.0
olefile==0.46
packaging==20.7
pandas==1.2.0
parso==0.7.1
patsy==0.5.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.0.1
pooch==1.3.0
praat-parselmouth==0.3.3
prompt-toolkit==3.0.8
protobuf==3.13.0
ptyprocess==0.6.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
pycwt==0.3.0a22
Pygments==2.7.3
PyInstaller==3.6
PyJWT==1.7.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.39.0
PySocks==1.7.1
python-dateutil==2.8.1
python-Levenshtein==0.12.0
pytorch-lightning==0.7.1
pytz==2020.5
PyWavelets==1.1.1
pyworld==0.2.12
PyYAML==5.3.1
regex==2020.11.13
requests==2.25.1
requests-oauthlib==1.3.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
rsa==4.6
scikit-image==0.16.2
scikit-learn==0.22.2.post1
scipy==1.5.4
six==1.15.0
SoundFile==0.10.3.post1
stopit==1.1.1
tensorboard==2.4.0
tensorboard-plugin-wit==1.7.0
tensorboardX==2.1
TextGrid==1.5
threadpoolctl==2.1.0
toolz==0.11.1
torch==1.6.0
torchaudio==0.6.0
torchvision==0.7.0
tqdm==4.54.1
traitlets==5.0.5
typing==3.7.4.3
urllib3==1.26.2
uuid==1.30
wcwidth==0.2.5
webencodings==0.5.1
webrtcvad==2.0.10
Werkzeug==1.0.1
pretty-midi==0.2.9


@@ -1,76 +0,0 @@
absl-py==0.15.0
appdirs==1.4.4
audioread==2.1.9
beautifulsoup4==4.10.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.7
cycler==0.11.0
Cython==0.29.24
decorator==4.4.2
dlib==19.22.1
einops==0.3.2
future==0.18.2
g2p-en==2.1.0
google==3.0.0
grpcio==1.42.0
h5py==2.8.0
horology==1.2.0
idna==3.3
imageio==2.10.1
imageio-ffmpeg==0.4.5
importlib-metadata==4.8.1
joblib==1.1.0
kiwisolver==1.3.2
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.4
matplotlib==3.4.3
miditoolkit==0.1.7
moviepy==1.0.3
numba==0.48.0
numpy==1.20.0
opencv-python==4.5.4.58
packaging==21.2
pandas==1.3.4
Pillow==8.4.0
pooch==1.5.2
praat-parselmouth==0.3.3
proglog==0.1.9
protobuf==3.19.1
pycparser==2.20
pycwt==0.3.0a22
pydub==0.25.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.43.0
python-dateutil==2.8.2
pytorch-lightning==0.7.1
pytorch-ssim==0.1
pytz==2021.3
pyworld==0.3.0
PyYAML==6.0
requests==2.26.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
scikit-image==0.16.2
scikit-learn==0.22
scipy==1.3.0
six==1.16.0
sklearn==0.0
SoundFile==0.10.3.post1
soupsieve==2.3
sympy==1.9
tensorboard==1.15.0
tensorboardX==2.4
test-tube==0.7.5
TextGrid==1.5
torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl
torchvision==0.9.1
tqdm==4.62.3
typing-extensions==3.10.0.2
urllib3==1.26.7
uuid==1.30
webrtcvad==2.0.10
Werkzeug==2.0.2
zipp==3.6.0