Mirror of https://github.com/AIGC-Audio/AudioGPT.git (synced 2025-12-16 20:07:58 +01:00)

Commit: update t2s
.gitignore (vendored, 2 changes)

@@ -4,7 +4,7 @@
 .circleci/
 
 # Byte-compiled / optimized / DLL files
-__pycache__/
+*__pycache__/
 *.py[cod]
 *$py.class
 
@@ -17,7 +17,14 @@ Input Example : Generate the audio of this image<br />
 Output:<br />
 <br />
 ## ASR
-First uploag your audio(.wav)<br />
+First upload your audio(.wav)<br />
 Input Example : Generate the text of this audio<br />
 Output:<br />
 <br />
+
+## Style Transfer Text-To-Speech
+First upload your audio(.wav)<br />
+Input Example : Speak using the voice of this audio. The text is "here we go".<br />
+Output:<br />
+<br />

BIN assets/style_transfer_tts.png (new file; binary file not shown; size: 785 KiB)
@@ -517,8 +517,8 @@ class ConversationBot:
             Tool(
                 name="Generate speech with unseen style derived from a reference audio acoustic reference from user input text and save it to a file", func= self.tts_ood.inference,
                 description="useful for when you want to generate high-quality speech samples with unseen styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
-                            "Like: generate a speech with unseen style derived from this custom voice. The text is xxx."
-                            "Or speak using the voice of this audio. The text is xxx."
+                            "Like: Generate a speech with unseen style derived from this custom voice. The text is xxx."
+                            "Or Speak using the voice of this audio. The text is xxx."
                             "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
             Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
                 description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
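For readers skimming the diff: both tools receive their argument as one flat string, and the `tts_ood` description tells the agent to pack two values into it. A minimal sketch of how such an `inference` entry point might split that input (the helper name is hypothetical, not part of this commit):

```python
# Hypothetical helper: split the "reference audio path, input text" string
# that the tts_ood Tool description asks the agent to produce.
def parse_tool_input(tool_input: str) -> tuple[str, str]:
    # Split on the first comma only, so commas inside the text survive.
    audio_path, text = tool_input.split(",", 1)
    return audio_path.strip(), text.strip()

audio_path, text = parse_tool_input("assets/ref.wav, here we go")
assert (audio_path, text) == ("assets/ref.wav", "here we go")
```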
@@ -1,77 +0,0 @@
```
! !
, ,
. .
; ;
<BOS> <BOS>
<EOS> <EOS>
? ?
AA0 AA0
AA1 AA1
AA2 AA2
AE0 AE0
AE1 AE1
AE2 AE2
AH0 AH0
AH1 AH1
AH2 AH2
AO0 AO0
AO1 AO1
AO2 AO2
AW0 AW0
AW1 AW1
AW2 AW2
AY0 AY0
AY1 AY1
AY2 AY2
B B
CH CH
D D
DH DH
EH0 EH0
EH1 EH1
EH2 EH2
ER0 ER0
ER1 ER1
ER2 ER2
EY0 EY0
EY1 EY1
EY2 EY2
F F
G G
HH HH
IH0 IH0
IH1 IH1
IH2 IH2
IY0 IY0
IY1 IY1
IY2 IY2
JH JH
K K
L L
M M
N N
NG NG
OW0 OW0
OW1 OW1
OW2 OW2
OY0 OY0
OY1 OY1
OY2 OY2
P P
R R
S S
SH SH
T T
TH TH
UH0 UH0
UH1 UH1
UH2 UH2
UW0 UW0
UW1 UW1
UW2 UW2
V V
W W
Y Y
Z Z
ZH ZH
| |
```
File diff suppressed because it is too large.
@@ -1 +0,0 @@
```json
["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
```
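The JSON list above is the serialized phone set that `_phone_encoder` in the binarizer below writes out as `phone_set.json`. A minimal sketch of turning such a list into a token-to-id mapping (ids assigned by list position; this mirrors the sorted-set construction in the code, not an exact reimplementation of `build_phone_encoder`):

```python
import json

# Load the phone set and assign ids by position. <BOS>/<EOS> and
# punctuation are ordinary entries in this scheme.
with open("phone_set.json") as f:
    phones = json.load(f)

phone_to_id = {p: i for i, p in enumerate(phones)}
id_to_phone = dict(enumerate(phones))

encoded = [phone_to_id[p] for p in "HH AH0 L OW1".split()]
print(encoded, [id_to_phone[i] for i in encoded])
```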
@@ -1,398 +0,0 @@
```python
import os
import random
from copy import deepcopy
import pandas as pd
import logging
from tqdm import tqdm
import json
import glob
import re
from resemblyzer import VoiceEncoder
import traceback
import numpy as np
import pretty_midi
import librosa
from scipy.interpolate import interp1d
import torch
from textgrid import TextGrid

from utils.hparams import hparams
from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
from utils.pitch_utils import f0_to_coarse
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.binarizer_zh import ZhBinarizer
from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
from vocoders.base_vocoder import VOCODERS


class SingingBinarizer(BaseBinarizer):
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2f0fn = {}
        self.item2tgfn = {}
        self.item2spk = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            wav_suffix = '_wf0.wav'
            txt_suffix = '.txt'
            ph_suffix = '_ph.txt'
            tg_suffix = '.TextGrid'
            all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')

            for piece_path in all_wav_pieces:
                item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
                self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
                self.item2wavfn[item_name] = piece_path

                self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @property
    def train_item_names(self):
        return self._train_item_names

    @property
    def valid_item_names(self):
        return self._test_item_names

    @property
    def test_item_names(self):
        return self._test_item_names

    def process(self):
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        json.dump(self.spk_map, open(spk_map_fn, 'w'))

        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def _phone_encoder(self):
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'))
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    # @staticmethod
    # def get_pitch(wav_fn, spec, res):
    #     wav_suffix = '_wf0.wav'
    #     f0_suffix = '_f0.npy'
    #     f0fn = wav_fn.replace(wav_suffix, f0_suffix)
    #     pitch_info = np.load(f0fn)
    #     f0 = [x[1] for x in pitch_info]
    #     spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
    #     f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    #     f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
    #     # f0_x_coor = np.arange(0, 1, 1 / len(f0))
    #     # f0_x_coor[-1] = 1
    #     # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
    #     if sum(f0) == 0:
    #         raise BinarizationError("Empty f0")
    #     assert len(f0) == len(spec), (len(f0), len(spec))
    #     pitch_coarse = f0_to_coarse(f0)
    #
    #     # vis f0
    #     # import matplotlib.pyplot as plt
    #     # from textgrid import TextGrid
    #     # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
    #     # fig = plt.figure(figsize=(12, 6))
    #     # plt.pcolor(spec.T, vmin=-5, vmax=0)
    #     # ax = plt.gca()
    #     # ax2 = ax.twinx()
    #     # ax2.plot(f0, color='red')
    #     # ax2.set_ylim(0, 800)
    #     # itvs = TextGrid.fromFile(tg_fn)[0]
    #     # for itv in itvs:
    #     #     x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
    #     #     plt.vlines(x=x, ymin=0, ymax=80, color='black')
    #     #     plt.text(x=x, y=20, s=itv.mark, color='black')
    #     # plt.savefig('tmp/20211229_singing_plots_test.png')
    #
    #     res['f0'] = f0
    #     res['pitch'] = pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                # cls.get_pitch(wav_fn, mel, res)
                cls.get_pitch(wav, mel, res)
            if binarization_args['with_txt']:
                try:
                    # print(ph)
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class MidiSingingBinarizer(SingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json')))  # [list of dict]

            for song_item in meta_midi:
                item_name = raw_item_name = song_item['item_name']
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2wavfn[item_name] = song_item['wav_fn']
                self.item2txt[item_name] = song_item['txt']

                self.item2ph[item_name] = ' '.join(song_item['phs'])
                self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
                self.item2ph_durs[item_name] = song_item['ph_dur']

                self.item2midi[item_name] = song_item['notes']
                self.item2midi_dur[item_name] = song_item['notes_dur']
                self.item2is_slur[item_name] = song_item['is_slur']
                self.item2spk[item_name] = 'pop-cs'
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"

        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'f0'

        item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
        res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
            res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)

        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @staticmethod
    def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
        mel2ph = np.zeros([mel.shape[0]], int)
        startTime = 0

        for i_ph in range(len(ph_durs)):
            start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
            end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
            mel2ph[start_frame:end_frame] = i_ph + 1
            startTime = startTime + ph_durs[i_ph]

        # print('ph durs: ', ph_durs)
        # print('mel2ph: ', mel2ph, len(mel2ph))
        res['mel2ph'] = mel2ph
        # res['dur'] = None

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
    pass


class OpencpopBinarizer(MidiSingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        raw_data_dir = hparams['raw_data_dir']
        # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json')))  # [list of dict]
        utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()

        for utterance_label in utterance_labels:
            song_info = utterance_label.split('|')
            item_name = raw_item_name = song_info[0]
            self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
            self.item2txt[item_name] = song_info[1]

            self.item2ph[item_name] = song_info[2]
            # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
            self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
            self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]

            self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                                         for x in song_info[3].split(" ")]
            self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
            self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
            self.item2spk[item_name] = 'opencpop'

        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'text_f0_align'

        item_name = os.path.splitext(os.path.basename(wav_fn))[0]
        res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)

        # gt f0.
        # f0 = None
        # f0_suffix = '_f0.npy'
        # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
        # pitch_info = np.load(f0fn)
        # f0 = [x[1] for x in pitch_info]
        # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
        #
        # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
        # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
        # if sum(f0) == 0:
        #     raise BinarizationError("Empty **gt** f0")
        #
        # pitch_coarse = f0_to_coarse(f0)
        # res['f0'] = f0
        # res['pitch'] = pitch_coarse

        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


if __name__ == "__main__":
    SingingBinarizer().process()
```
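To make the duration-to-frame alignment in the deleted `get_align` above concrete, here is a self-contained sketch of the same mel2ph computation on toy numbers (the sample rate and hop size are illustrative, not taken from any config in this commit):

```python
import numpy as np

def durations_to_mel2ph(ph_durs, n_frames, sample_rate=24000, hop_size=128):
    # Same scheme as get_align above: frame i gets the 1-based index of the
    # phoneme active at that time; 0 marks unassigned (trailing) frames.
    mel2ph = np.zeros([n_frames], int)
    t = 0.0
    for i_ph, dur in enumerate(ph_durs):
        start = int(t * sample_rate / hop_size + 0.5)
        end = int((t + dur) * sample_rate / hop_size + 0.5)
        mel2ph[start:end] = i_ph + 1
        t += dur
    return mel2ph

# Two phonemes of 0.1 s each -> 18.75 frames per phoneme at 24 kHz / hop 128,
# so roughly frames 0-18 map to phoneme 1 and frames 19-37 to phoneme 2.
print(durations_to_mel2ph([0.1, 0.1], n_frames=40))
```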
@@ -1,20 +0,0 @@
```python
import os

os.environ["OMP_NUM_THREADS"] = "1"

import importlib
from utils.hparams import set_hparams, hparams


def binarize():
    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg = ".".join(binarizer_cls.split(".")[:-1])
    cls_name = binarizer_cls.split(".")[-1]
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()


if __name__ == '__main__':
    set_hparams()
    binarize()
```
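The `binarizer_cls` lookup above is a small plugin mechanism: a dotted path in the config resolves to a class at runtime. A self-contained sketch of the same pattern (the example path is hypothetical; the module name of the deleted binarizer file is not shown in this view):

```python
import importlib

def resolve_class(dotted_path: str):
    # Same pattern as binarize(): everything before the last dot is the
    # module, the last component is the class name.
    pkg, _, cls_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(pkg), cls_name)

# e.g. resolve_class("data_gen.tts.singing.binarize.OpencpopBinarizer").process()
# would run the Opencpop pipeline, assuming that module path exists.
```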
@@ -1,17 +0,0 @@
```python
from utils.hparams import set_hparams, hparams

import importlib


def preprocess():
    assert hparams['preprocess_cls'] != ''

    pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
    cls_name = hparams["preprocess_cls"].split(".")[-1]
    process_cls = getattr(importlib.import_module(pkg), cls_name)
    process_cls().process()


if __name__ == '__main__':
    set_hparams()
    preprocess()
```
@@ -1,15 +0,0 @@
```python
import subprocess
from utils.hparams import hparams, set_hparams
import os


def train_mfa_align():
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    print(f"| Run MFA for {CORPUS}.")
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
    subprocess.check_call(f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash data_gen/tts/scripts/run_mfa_train_align.sh', shell=True)


if __name__ == '__main__':
    set_hparams(print_hparams=False)
    train_mfa_align()
```
@@ -1,111 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446) | [GitHub](https://github.com/MoonInTheRiver/DiffSinger) | [Releases](https://github.com/MoonInTheRiver/DiffSinger/releases)

## DiffSinger (MIDI SVS | A version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you access to Opencpop.

The pipeline below is designed for the Opencpop dataset:

### 1. Preparation

#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`

b) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml

# `data/binary/opencpop-midi-dp` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

#### Exp Name Preparation
```bash
export MY_FS_EXP_NAME=0302_opencpop_fs_midi
export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
```

```
.
|--data
    |--raw
        |--opencpop
            |--segments
                |--transcriptions.txt
                |--wavs
|--checkpoints
    |--MY_FS_EXP_NAME (optional)
    |--MY_DS_EXP_NAME (optional)
    |--0109_hifigan_bigpopcs_hop128
        |--model_ckpt_steps_1512000.ckpt
        |--config.yaml
```

### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
```

Then, to train DiffSinger, run:

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.

### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```

We also provide:
- the pre-trained model of DiffSinger;
- the pre-trained model of FFT-Singer.

They can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).

Remember to put the pre-trained models in the `checkpoints` directory.

### 4. Inference from raw inputs
```sh
python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
    'text': '小酒窝长睫毛AP是你最美的记号',
    'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
    'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
    'input_type': 'word'
}  # user input: Chinese characters
or,
inp = {
    'text': '小酒窝长睫毛AP是你最美的记号',
    'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
    'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
    'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
    'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
    'input_type': 'phoneme'
}  # input like the Opencpop dataset.
```

### 5. Some issues
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is out-of-domain for it (an unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.

b) In this version of the code, we use the melody frontend ([lyric + MIDI] -> [F0 + ph_dur]) to predict the F0 contour and phoneme durations.

c) Generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
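As an aside, the `'notes'` strings above use pitch names that the binarizer converts with librosa; a minimal sketch of that conversion (the rest-handling mirrors `OpencpopBinarizer.load_meta_data` in the deleted code earlier on this page):

```python
import librosa

def notes_to_midi(note_seq: str):
    # "C#4/Db4" carries two spellings of the same pitch, so either works;
    # take the first. "rest" maps to 0, as in the deleted binarizer code.
    return [librosa.note_to_midi(n.split("/")[0]) if n != 'rest' else 0
            for n in note_seq.split(" ")]

print(notes_to_midi("C#4/Db4 F#4/Gb4 rest"))  # -> [61, 66, 0]
```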
@@ -1,107 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446) | [GitHub](https://github.com/MoonInTheRiver/DiffSinger) | [Releases](https://github.com/MoonInTheRiver/DiffSinger/releases) | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)

Substantial update: we 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
**By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**

(In short: we hand the dynamics of the F0 contour to the generative model to capture, rather than constraining log-domain F0 with an MSE loss as before.)

## DiffSinger (MIDI SVS | B version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to give you access to Opencpop.

The pipeline below is designed for the Opencpop dataset:

### 1. Preparation

#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`

b) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml

# `data/binary/opencpop-midi-dp` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.

Also, please unzip the pre-trained vocoder and [this pendant for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

#### Exp Name Preparation
```bash
export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
```

```
.
|--data
    |--raw
        |--opencpop
            |--segments
                |--transcriptions.txt
                |--wavs
|--checkpoints
    |--MY_DS_EXP_NAME (optional)
    |--0109_hifigan_bigpopcs_hop128 (vocoder)
        |--model_ckpt_steps_1512000.ckpt
        |--config.yaml
```

### 2. Training Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```

### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```

We also provide:
- the pre-trained model of DiffSinger.

It can be found [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).

Remember to put the pre-trained models in the `checkpoints` directory.

### 4. Inference from raw inputs
```sh
python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
    'text': '小酒窝长睫毛AP是你最美的记号',
    'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
    'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
    'input_type': 'word'
}  # user input: Chinese characters
or,
inp = {
    'text': '小酒窝长睫毛AP是你最美的记号',
    'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
    'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
    'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
    'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
    'input_type': 'phoneme'
}  # input like the Opencpop dataset.
```

### 5. Some issues
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is out-of-domain for it (an unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.

b) In this version of the code, we use the melody frontend ([lyric + MIDI] -> [ph_dur]) to predict phoneme durations only; the F0 curve is implicitly predicted together with the mel-spectrogram.

c) Example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
@@ -1,63 +0,0 @@
## DiffSinger (SVS version)

### 0. Data Acquirement
- See the [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).

### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`

b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.

(Update: you can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)

This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.

### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch:

```sh
# First, train FFT-Singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, run inference with FFT-Singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```

Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.

### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger.

Remember to put the pre-trained models in the `checkpoints` directory.

*Note that:*

- *the original PWG-version vocoder used in the paper has been put into commercial use, so we provide this HifiGAN-version vocoder as a substitute.*
- *we assume the ground-truth F0 to be given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*

[1] Adversarially Trained Multi-Singer Sequence-to-Sequence Singing Synthesizer. Interspeech 2020.

[2] Sequence-to-Sequence Singing Synthesis Using the Feed-Forward Transformer. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined from the Web. KDD 2020.
@@ -1,76 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446) | [GitHub](https://github.com/MoonInTheRiver/DiffSinger) | [Releases](https://github.com/MoonInTheRiver/DiffSinger/releases) | [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)

## DiffSinger (SVS)

### PART 1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
In PART 1, we focus only on spectrum modeling (the acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information, following [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART 2.

Thus, the pipeline of this part can be summarized as:

```
[lyrics] -> [linguistic representation] (Frontend)
[linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
```

[1] Adversarially Trained Multi-Singer Sequence-to-Sequence Singing Synthesizer. Interspeech 2020.

[2] Sequence-to-Sequence Singing Synthesis Using the Feed-Forward Transformer. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined from the Web. KDD 2020.

Click here for detailed instructions: [link](README-SVS-popcs.md).

### PART 2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan. 20, 2022** (after we published our paper).

Since the dataset has elaborately annotated MIDI labels, we can supplement the pipeline in PART 1 with a naive melody frontend.

#### 2.A
The pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:

```
[lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
[linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```

Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).

#### 2.B
In 2.A, we found that predicting F0 explicitly in the melody frontend produces many bad cases of unvoiced/voiced (uv/v) prediction. We therefore abandon the explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.

Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
[linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] -> [predicted F0] (Pitch extractor)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```

Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).

### FAQ
Q1: Why do I need F0 in the vocoders?

A1: See the vocoder parts of HiFiSinger, DiffSinger, or SingGAN. This is common practice now.

Q2: Why not run the MIDI-version SVS on the PopCS dataset? Or: why not release MIDI labels for the PopCS dataset?

A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon.

Q3: Why do I get "'HifiGAN' object has no attribute 'model'"?

A3: Please put the pre-trained vocoders in your `checkpoints` directory.

Q4: How do I check whether GT information or predicted information is used during inference from the packed test set?

A4: Please see the code [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).

...
@@ -1,69 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[arXiv](https://arxiv.org/abs/2105.02446) | [GitHub](https://github.com/MoonInTheRiver/DiffSinger) | [Releases](https://github.com/MoonInTheRiver/DiffSinger/releases) | [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)

## DiffSpeech (TTS)
### 1. Preparation

#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`

b) Download and unzip the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`

c) Run the following scripts to pack the dataset for training/inference.

```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml

# `data/binary/ljspeech` will be generated.
```

#### Vocoder Preparation
We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip this file into `checkpoints` before training your acoustic model.

### 2. Training Example

First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.

### 3. Inference Example

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```

We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech.

Remember to put the pre-trained models in the `checkpoints` directory.

## Mel Visualization
Along the vertical axis, DiffSpeech occupies rows [0-80] and FastSpeech 2 rows [80-160].

<table style="width:100%">
    <tr>
        <th>DiffSpeech vs. FastSpeech 2</th>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
        <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
</table>
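For readers reproducing these comparison images, a minimal sketch of the stacked layout just described (random arrays stand in for real mel-spectrograms; the color range and figure size are illustrative assumptions):

```python
import numpy as np
import matplotlib.pyplot as plt

# Stack two 80-bin mels vertically: DiffSpeech in rows 0-80,
# FastSpeech 2 in rows 80-160, matching the layout described above.
mel_diffspeech = np.random.rand(80, 400)   # placeholder for a real mel
mel_fastspeech2 = np.random.rand(80, 400)  # placeholder for a real mel
stacked = np.concatenate([mel_diffspeech, mel_fastspeech2], axis=0)

plt.figure(figsize=(12, 6))
plt.pcolor(stacked, vmin=0, vmax=1)
plt.ylabel("DiffSpeech: rows 0-80 / FastSpeech 2: rows 80-160")
plt.savefig("diffspeech-vs-fs2.png")
```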
@@ -1,212 +0,0 @@
|
|||||||
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
|
|
||||||
[](https://arxiv.org/abs/2105.02446)
|
|
||||||
[](https://github.com/MoonInTheRiver/DiffSinger)
|
|
||||||
[](https://github.com/MoonInTheRiver/DiffSinger/releases)
|
|
||||||
| [](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
|
|
||||||
| [English README](../README.md)
|
|
||||||
|
|
||||||
本仓库包含了我们的AAAI-2022 [论文](https://arxiv.org/abs/2105.02446)中提出的DiffSpeech (用于语音合成) 与 DiffSinger (用于歌声合成) 的官方Pytorch实现。
|
|
||||||
|
|
||||||
<table style="width:100%">
|
|
||||||
<tr>
|
|
||||||
<th>DiffSinger/DiffSpeech训练阶段</th>
|
|
||||||
<th>DiffSinger/DiffSpeech推理阶段</th>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><img src="resources/model_a.png" alt="Training" height="300"></td>
|
|
||||||
<td><img src="resources/model_b.png" alt="Inference" height="300"></td>
|
|
||||||
</tr>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
:tada: :tada: :tada: **一些重要更新**:
|
|
||||||
- Mar.2, 2022: [MIDI-新版](README-SVS-opencpop-e2e.md): 重大更新 :sparkles:
|
|
||||||
- Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), 为了歌声美化任务的代码,开源了 :sparkles: :sparkles: :sparkles: .
|
|
||||||
- Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), 一个升级后的代码框架, 包含了DiffSpeech和我们NeurIPS-2021的工作[PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq) 已经开源! :sparkles: :sparkles: :sparkles:.
|
|
||||||
- Jan.29, 2022: 支持了[MIDI-旧版](README-SVS-opencpop-cascade.md) 版本的歌声合成系统.
|
|
||||||
- Jan.13, 2022: 支持了歌声合成系统, 开源了PopCS数据集.
|
|
||||||
- Dec.19, 2021: 支持了语音合成系统. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
|
|
||||||
|
|
||||||
:rocket: **新闻**:
|
|
||||||
- Feb.24, 2022: 我们的新工作`NeuralSVB` 被 ACL-2022 接收 [](https://arxiv.org/abs/2202.13277). [音频演示](https://neuralsvb.github.io).
|
|
||||||
- Dec.01, 2021: DiffSinger被AAAI-2022接收.
|
|
||||||
- Sep.29, 2021: 我们的新工作`PortaSpeech: Portable and High-Quality Generative Text-to-Speech` 被NeurIPS-2021接收 [](https://arxiv.org/abs/2109.15166) .
|
|
||||||
- May.06, 2021: 我们把这篇DiffSinger提交到了公开论文网站: Arxiv [](https://arxiv.org/abs/2105.02446).
|
|
||||||
|
|
||||||
## 安装依赖
|
|
||||||
```sh
|
|
||||||
conda create -n your_env_name python=3.8
|
|
||||||
source activate your_env_name
|
|
||||||
pip install -r requirements_2080.txt (GPU 2080Ti, CUDA 10.2)
|
|
||||||
or pip install -r requirements_3090.txt (GPU 3090, CUDA 11.4)
|
|
||||||
```
|
|
||||||
|
|
||||||
## DiffSpeech (语音合成的版本)
|
|
||||||
### 1. 准备工作
|
|
||||||
|
|
||||||
#### 数据准备
|
|
||||||
a) 下载并解压 [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), 创建软链接: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
|
|
||||||
|
|
||||||
b) 下载并解压 [我们用MFA预处理好的对齐](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
|
|
||||||
|
|
||||||
c) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
|
|
||||||
|
|
||||||
```sh
|
|
||||||
export PYTHONPATH=.
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
|
|
||||||
|
|
||||||
# `data/binary/ljspeech` will be generated.
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 声码器准备
|
|
||||||
我们提供了[HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip)声码器的预训练模型.
|
|
||||||
请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
|
|
||||||
|
|
||||||
### 2. 训练样例
|
|
||||||
|
|
||||||
首先你需要一个预训练好的FastSpeech2存档点. 你可以用[我们预训练好的模型](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), 或者跑下面这个指令从零开始训练FastSpeech2:
|
|
||||||
```sh
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
|
|
||||||
```
|
|
||||||
然后为了训练DiffSpeech, 运行:
|
|
||||||
```sh
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
|
|
||||||
```
|
|
||||||
|
|
||||||
记得针对你的路径修改`usr/configs/lj_ds_beta6.yaml`里"fs2_ckpt"这个参数.
|
|
||||||
|
|
||||||
### 3. 推理样例
|
|
||||||
|
|
||||||
```sh
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
|
|
||||||
```
|
|
||||||
|
|
||||||
我们也提供了:
|
|
||||||
- [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip)的预训练模型;
|
|
||||||
- [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip)的预训练模型, 这是为了DiffSpeech里的浅扩散机制;
|
|
||||||
|
|
||||||
记得把预训练模型放在 `checkpoints` 目录.
|
|
||||||
|
|
||||||
## DiffSinger (歌声合成的版本)
|
|
||||||
|
|
||||||
### 0. 数据获取
|
|
||||||
- 见 [申请表](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
|
|
||||||
- 数据集 [预览](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
|
|
||||||
|
|
||||||
### 1. Preparation
|
|
||||||
#### 数据准备
|
|
||||||
a) 下载并解压PopCS, 创建软链接: `ln -s /xxx/popcs/ data/processed/popcs`
|
|
||||||
|
|
||||||
b) 按照如下脚本给数据集打包,打包后的二进制文件用于后续的训练和推理.
|
|
||||||
```sh
|
|
||||||
export PYTHONPATH=.
|
|
||||||
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
|
|
||||||
# `data/binary/popcs-pmf0` 会生成出来.
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 声码器准备
|
|
||||||
我们提供了[HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip)的预训练模型, 它专门为了歌声合成系统设计, 采用了NSF的技术。
|
|
||||||
请在训练声学模型前,先把声码器文件解压到`checkpoints`里。
|
|
||||||
|
|
||||||
(更新: 你也可以将我们提供的[训练更多步数的存档点](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt)放到声码器的文件夹里)
|
|
||||||
|
|
||||||
这个声码器是在大约70小时的较大数据集上训练的, 可以被认为是一个通用声码器。
|
|
||||||
|
|
||||||
### 2. Training Example

First you need a pretrained FFT-Singer. You can use [our pretrained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train an FFT-Singer from scratch with the following script:

```sh
# First, train the FFT-Singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, run FFT-Singer inference;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```

Then, to train DiffSinger, run:

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```

Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to your own path.

### 3. Inference Example

```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```

We also provide:
- the pretrained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pretrained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), which is needed by the shallow diffusion mechanism in DiffSinger;

Remember to put the pretrained models in the `checkpoints` directory.

*Please note:*

- *The PWG-version vocoder used in our original paper has been put to commercial use, so we provide this HifiGAN-version vocoder as a substitute.*

- *This paper assumes that ground-truth F0 is provided for the experiments, as previous works [1][2][3] do; the focus is on spectrum modeling rather than F0 curve prediction. If you want to experiment on MIDI data and predict F0 curves (explicitly or implicitly) from MIDI and lyrics, please see [MIDI-old-version](README-SVS-opencpop-cascade.md) or [MIDI-new-version](README-SVS-opencpop-e2e.md). The MIDI dataset currently supported is Opencpop.*

[1] Adversarially Trained Multi-Singer Sequence-to-Sequence Singing Synthesizer. Interspeech 2020.

[2] Sequence-to-Sequence Singing Synthesis Using the Feed-Forward Transformer. ICASSP 2020.

[3] DeepSinger: Singing Voice Synthesis with Data Mined From the Web. KDD 2020.

## Tensorboard

```sh
tensorboard --logdir_spec exp_name
```
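`--logdir_spec` also accepts `name:path` pairs, which is handy for labeling and comparing runs. A minimal sketch (this assumes training logs are written under `checkpoints/<exp_name>`, which may differ in your setup):

```sh
# Label the run and point tensorboard at its log directory
tensorboard --logdir_spec lj_ds_beta6_1213:checkpoints/lj_ds_beta6_1213
```
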
<table style="width:100%">
    <tr>
    <td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
    </tr>
</table>

## Mel Visualization

Along the vertical axis, DiffSpeech: [0-80]; FastSpeech 2: [80-160].

<table style="width:100%">
    <tr>
    <th>DiffSpeech vs. FastSpeech 2</th>
    </tr>
    <tr>
    <td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
    <td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
    <tr>
    <td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
    </tr>
</table>

## Audio Demos

Audio samples can be found on our [demo page](https://diffsinger.github.io/).

We also put part of the test-set audio samples generated by DiffSpeech+HifiGAN (marked as [P]) and GT mel+HifiGAN (marked as [G]) in [resources/demos_1213](../resources/demos_1213).

(Corresponding pretrained checkpoint: [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip))

---
:rocket: :rocket: :rocket: **Update:**

Newly generated singing voice samples are in [resources/demos_0112](../resources/demos_0112).

## Citation

If this repository is useful for your research and work, please cite the following paper:

    @article{liu2021diffsinger,
      title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
      author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
      journal={arXiv preprint arXiv:2105.02446},
      volume={2},
      year={2021}
    }

## Acknowledgements

Our code is based on the following repositories:

* [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
* [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
* [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
* [HifiGAN](https://github.com/jik876/hifi-gan)
* [espnet](https://github.com/espnet/espnet)
* [DiffWave](https://github.com/lmnt-com/diffwave)
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
libsndfile1
@@ -1,118 +0,0 @@
absl-py==0.11.0
alignment==1.0.10
altgraph==0.17
appdirs==1.4.4
async-timeout==3.0.1
audioread==2.1.9
backcall==0.2.0
blinker==1.4
brotlipy==0.7.0
cachetools==4.2.0
certifi==2020.12.5
cffi==1.14.4
chardet==4.0.0
click==7.1.2
cycler==0.10.0
Cython==0.29.21
cytoolz==0.11.0
decorator==4.4.2
Distance==0.1.3
einops==0.3.0
et-xmlfile==1.0.1
fsspec==0.8.4
future==0.18.2
g2p-en==2.1.0
g2pM==0.1.2.5
google-auth==1.24.0
google-auth-oauthlib==0.4.2
grpcio==1.34.0
h5py==3.1.0
horology==1.1.0
httplib2==0.18.1
idna==2.10
imageio==2.9.0
inflect==5.0.2
ipdb==0.13.4
ipython==7.19.0
ipython-genutils==0.2.0
jdcal==1.4.1
jedi==0.17.2
jieba==0.42.1
jiwer==2.2.0
joblib==1.0.0
kiwisolver==1.3.1
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.3
matplotlib==3.3.3
miditoolkit==0.1.7
mido==1.2.9
music21==5.7.2
networkx==2.5
nltk==3.5
numba==0.48.0
numpy==1.19.4
oauth2client==4.1.3
oauthlib==3.1.0
olefile==0.46
packaging==20.7
pandas==1.2.0
parso==0.7.1
patsy==0.5.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.0.1
pooch==1.3.0
praat-parselmouth==0.3.3
prompt-toolkit==3.0.8
protobuf==3.13.0
ptyprocess==0.6.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
pycwt==0.3.0a22
Pygments==2.7.3
PyInstaller==3.6
PyJWT==1.7.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.39.0
PySocks==1.7.1
python-dateutil==2.8.1
python-Levenshtein==0.12.0
pytorch-lightning==0.7.1
pytz==2020.5
PyWavelets==1.1.1
pyworld==0.2.12
PyYAML==5.3.1
regex==2020.11.13
requests==2.25.1
requests-oauthlib==1.3.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
rsa==4.6
scikit-image==0.16.2
scikit-learn==0.22.2.post1
scipy==1.5.4
six==1.15.0
SoundFile==0.10.3.post1
stopit==1.1.1
tensorboard==2.4.0
tensorboard-plugin-wit==1.7.0
tensorboardX==2.1
TextGrid==1.5
threadpoolctl==2.1.0
toolz==0.11.1
torch==1.6.0
torchaudio==0.6.0
torchvision==0.7.0
tqdm==4.54.1
traitlets==5.0.5
typing==3.7.4.3
urllib3==1.26.2
uuid==1.30
wcwidth==0.2.5
webencodings==0.5.1
webrtcvad==2.0.10
Werkzeug==1.0.1
pretty-midi==0.2.9
@@ -1,76 +0,0 @@
absl-py==0.15.0
appdirs==1.4.4
audioread==2.1.9
beautifulsoup4==4.10.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.7
cycler==0.11.0
Cython==0.29.24
decorator==4.4.2
dlib==19.22.1
einops==0.3.2
future==0.18.2
g2p-en==2.1.0
google==3.0.0
grpcio==1.42.0
h5py==2.8.0
horology==1.2.0
idna==3.3
imageio==2.10.1
imageio-ffmpeg==0.4.5
importlib-metadata==4.8.1
joblib==1.1.0
kiwisolver==1.3.2
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.4
matplotlib==3.4.3
miditoolkit==0.1.7
moviepy==1.0.3
numba==0.48.0
numpy==1.20.0
opencv-python==4.5.4.58
packaging==21.2
pandas==1.3.4
Pillow==8.4.0
pooch==1.5.2
praat-parselmouth==0.3.3
proglog==0.1.9
protobuf==3.19.1
pycparser==2.20
pycwt==0.3.0a22
pydub==0.25.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.43.0
python-dateutil==2.8.2
pytorch-lightning==0.7.1
pytorch-ssim==0.1
pytz==2021.3
pyworld==0.3.0
PyYAML==6.0
requests==2.26.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
scikit-image==0.16.2
scikit-learn==0.22
scipy==1.3.0
six==1.16.0
sklearn==0.0
SoundFile==0.10.3.post1
soupsieve==2.3
sympy==1.9
tensorboard==1.15.0
tensorboardX==2.4
test-tube==0.7.5
TextGrid==1.5
torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl
torchvision==0.9.1
tqdm==4.62.3
typing-extensions==3.10.0.2
urllib3==1.26.7
uuid==1.30
webrtcvad==2.0.10
Werkzeug==2.0.2
zipp==3.6.0