update t2s

PeppaPiggeee
2023-03-25 21:45:49 +08:00
parent 1eaf0ae017
commit 8de9ce6595
23 changed files with 11 additions and 28220 deletions

.gitignore

@@ -4,7 +4,7 @@
 .circleci/
 # Byte-compiled / optimized / DLL files
-__pycache__/
+*__pycache__/
 *.py[cod]
 *$py.class


@@ -17,7 +17,14 @@ Input Example : Generate the audio of this image<br />
 Output:<br />
 ![](i2a-2.png)<br />
 ## ASR
-First uploag your audio(.wav)<br />
+First upload your audio(.wav)<br />
 Input Example : Generate the text of this audio<br />
 Output:<br />
 ![](asr.png)<br />
+## Style Transfer Text-To-Speech
+First upload your audio(.wav)<br />
+Input Example : Speak using the voice of this audio. The text is "here we go".<br />
+Output:<br />
+![](style_transfer_tts.png)<br />

Binary file not shown (image, 785 KiB).


@@ -517,8 +517,8 @@ class ConversationBot:
             Tool(
                 name="Generate speech with unseen style derived from a reference audio acoustic reference from user input text and save it to a file", func= self.tts_ood.inference,
                 description="useful for when you want to generate high-quality speech samples with unseen styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
-                            "Like: generate a speech with unseen style derived from this custom voice. The text is xxx."
-                            "Or speak using the voice of this audio. The text is xxx."
+                            "Like: Generate a speech with unseen style derived from this custom voice. The text is xxx."
+                            "Or Speak using the voice of this audio. The text is xxx."
                             "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
             Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
                  description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
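The descriptions above tell the language-model agent to pass this tool a single comma-separated string holding the reference audio path and the input text. A minimal sketch of how such an input might be split before calling an inference function; the helper below is hypothetical and not the repository's actual parsing code:

```python
def split_audio_and_text(tool_input):
    """Split a 'reference_audio_path, input text' string at the first comma.

    Hypothetical helper: the repository's tts_ood.inference may parse its
    input differently.
    """
    audio_path, _, text = tool_input.partition(",")
    return audio_path.strip(), text.strip()


if __name__ == "__main__":
    path, text = split_audio_and_text("audio/ref_voice.wav, here we go")
    print(path)  # audio/ref_voice.wav
    print(text)  # here we go
```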


@@ -1,77 +0,0 @@
! !
, ,
. .
; ;
<BOS> <BOS>
<EOS> <EOS>
? ?
AA0 AA0
AA1 AA1
AA2 AA2
AE0 AE0
AE1 AE1
AE2 AE2
AH0 AH0
AH1 AH1
AH2 AH2
AO0 AO0
AO1 AO1
AO2 AO2
AW0 AW0
AW1 AW1
AW2 AW2
AY0 AY0
AY1 AY1
AY2 AY2
B B
CH CH
D D
DH DH
EH0 EH0
EH1 EH1
EH2 EH2
ER0 ER0
ER1 ER1
ER2 ER2
EY0 EY0
EY1 EY1
EY2 EY2
F F
G G
HH HH
IH0 IH0
IH1 IH1
IH2 IH2
IY0 IY0
IY1 IY1
IY2 IY2
JH JH
K K
L L
M M
N N
NG NG
OW0 OW0
OW1 OW1
OW2 OW2
OY0 OY0
OY1 OY1
OY2 OY2
P P
R R
S S
SH SH
T T
TH TH
UH0 UH0
UH1 UH1
UH2 UH2
UW0 UW0
UW1 UW1
UW2 UW2
V V
W W
Y Y
Z Z
ZH ZH
| |

File diff suppressed because it is too large.


@@ -1 +0,0 @@
["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", "AA1", "AA2", "AE0", "AE1", "AE2", "AH0", "AH1", "AH2", "AO0", "AO1", "AO2", "AW0", "AW1", "AW2", "AY0", "AY1", "AY2", "B", "CH", "D", "DH", "EH0", "EH1", "EH2", "ER0", "ER1", "ER2", "EY0", "EY1", "EY2", "F", "G", "HH", "IH0", "IH1", "IH2", "IY0", "IY1", "IY2", "JH", "K", "L", "M", "N", "NG", "OW0", "OW1", "OW2", "OY0", "OY1", "OY2", "P", "R", "S", "SH", "T", "TH", "UH0", "UH1", "UH2", "UW0", "UW1", "UW2", "V", "W", "Y", "Z", "ZH", "|"]
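The deleted phone_set.json above lists the 77 phoneme symbols used by the text frontend. For orientation, a minimal sketch of how such a symbol list is typically turned into an integer encoder; the helper is hypothetical, and the repository's build_phone_encoder may reserve extra ids:

```python
import json


def load_phone_encoder(phone_set_path):
    """Build symbol<->id mappings from a phone_set.json-style list.

    Hypothetical sketch: the repository's build_phone_encoder may reserve
    special ids (e.g. padding) and behave differently.
    """
    with open(phone_set_path) as f:
        symbols = json.load(f)  # e.g. ["!", ",", ".", ";", "<BOS>", "<EOS>", "?", "AA0", ...]
    ph2id = {ph: i for i, ph in enumerate(symbols)}
    id2ph = {i: ph for ph, i in ph2id.items()}

    def encode(ph_str):
        return [ph2id[p] for p in ph_str.split(' ')]

    return encode, id2ph


# Example (assuming a local phone_set.json):
# encode, id2ph = load_phone_encoder('phone_set.json')
# print(encode('HH AH0 L OW1'))
```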


@@ -1,398 +0,0 @@
import os
import random
from copy import deepcopy
import pandas as pd
import logging
from tqdm import tqdm
import json
import glob
import re
from resemblyzer import VoiceEncoder
import traceback
import numpy as np
import pretty_midi
import librosa
from scipy.interpolate import interp1d
import torch
from textgrid import TextGrid

from utils.hparams import hparams
from data_gen.tts.data_gen_utils import build_phone_encoder, get_pitch
from utils.pitch_utils import f0_to_coarse
from data_gen.tts.base_binarizer import BaseBinarizer, BinarizationError
from data_gen.tts.binarizer_zh import ZhBinarizer
from data_gen.tts.txt_processors.zh_g2pM import ALL_YUNMU
from vocoders.base_vocoder import VOCODERS


class SingingBinarizer(BaseBinarizer):
    def __init__(self, processed_data_dir=None):
        if processed_data_dir is None:
            processed_data_dir = hparams['processed_data_dir']
        self.processed_data_dirs = processed_data_dir.split(",")
        self.binarization_args = hparams['binarization_args']
        self.pre_align_args = hparams['pre_align_args']
        self.item2txt = {}
        self.item2ph = {}
        self.item2wavfn = {}
        self.item2f0fn = {}
        self.item2tgfn = {}
        self.item2spk = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([ts in x for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            wav_suffix = '_wf0.wav'
            txt_suffix = '.txt'
            ph_suffix = '_ph.txt'
            tg_suffix = '.TextGrid'
            all_wav_pieces = glob.glob(f'{processed_data_dir}/*/*{wav_suffix}')
            for piece_path in all_wav_pieces:
                item_name = raw_item_name = piece_path[len(processed_data_dir)+1:].replace('/', '-')[:-len(wav_suffix)]
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2txt[item_name] = open(f'{piece_path.replace(wav_suffix, txt_suffix)}').readline()
                self.item2ph[item_name] = open(f'{piece_path.replace(wav_suffix, ph_suffix)}').readline()
                self.item2wavfn[item_name] = piece_path
                self.item2spk[item_name] = re.split('-|#', piece_path.split('/')[-2])[0]
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
                self.item2tgfn[item_name] = piece_path.replace(wav_suffix, tg_suffix)
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @property
    def train_item_names(self):
        return self._train_item_names

    @property
    def valid_item_names(self):
        return self._test_item_names

    @property
    def test_item_names(self):
        return self._test_item_names

    def process(self):
        self.load_meta_data()
        os.makedirs(hparams['binary_data_dir'], exist_ok=True)
        self.spk_map = self.build_spk_map()
        print("| spk_map: ", self.spk_map)
        spk_map_fn = f"{hparams['binary_data_dir']}/spk_map.json"
        json.dump(self.spk_map, open(spk_map_fn, 'w'))
        self.phone_encoder = self._phone_encoder()
        self.process_data('valid')
        self.process_data('test')
        self.process_data('train')

    def _phone_encoder(self):
        ph_set_fn = f"{hparams['binary_data_dir']}/phone_set.json"
        ph_set = []
        if hparams['reset_phone_dict'] or not os.path.exists(ph_set_fn):
            for ph_sent in self.item2ph.values():
                ph_set += ph_sent.split(' ')
            ph_set = sorted(set(ph_set))
            json.dump(ph_set, open(ph_set_fn, 'w'))
            print("| Build phone set: ", ph_set)
        else:
            ph_set = json.load(open(ph_set_fn, 'r'))
            print("| Load phone set: ", ph_set)
        return build_phone_encoder(hparams['binary_data_dir'])

    # @staticmethod
    # def get_pitch(wav_fn, spec, res):
    #     wav_suffix = '_wf0.wav'
    #     f0_suffix = '_f0.npy'
    #     f0fn = wav_fn.replace(wav_suffix, f0_suffix)
    #     pitch_info = np.load(f0fn)
    #     f0 = [x[1] for x in pitch_info]
    #     spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
    #     f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    #     f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
    #     # f0_x_coor = np.arange(0, 1, 1 / len(f0))
    #     # f0_x_coor[-1] = 1
    #     # f0 = interp1d(f0_x_coor, f0, 'nearest')(spec_x_coor)[:len(spec)]
    #     if sum(f0) == 0:
    #         raise BinarizationError("Empty f0")
    #     assert len(f0) == len(spec), (len(f0), len(spec))
    #     pitch_coarse = f0_to_coarse(f0)
    #
    #     # vis f0
    #     # import matplotlib.pyplot as plt
    #     # from textgrid import TextGrid
    #     # tg_fn = wav_fn.replace(wav_suffix, '.TextGrid')
    #     # fig = plt.figure(figsize=(12, 6))
    #     # plt.pcolor(spec.T, vmin=-5, vmax=0)
    #     # ax = plt.gca()
    #     # ax2 = ax.twinx()
    #     # ax2.plot(f0, color='red')
    #     # ax2.set_ylim(0, 800)
    #     # itvs = TextGrid.fromFile(tg_fn)[0]
    #     # for itv in itvs:
    #     #     x = itv.maxTime * hparams['audio_sample_rate'] / hparams['hop_size']
    #     #     plt.vlines(x=x, ymin=0, ymax=80, color='black')
    #     #     plt.text(x=x, y=20, s=itv.mark, color='black')
    #     # plt.savefig('tmp/20211229_singing_plots_test.png')
    #
    #     res['f0'] = f0
    #     res['pitch'] = pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                # cls.get_pitch(wav_fn, mel, res)
                cls.get_pitch(wav, mel, res)
            if binarization_args['with_txt']:
                try:
                    # print(ph)
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(tg_fn, ph, mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class MidiSingingBinarizer(SingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def load_meta_data(self):
        for ds_id, processed_data_dir in enumerate(self.processed_data_dirs):
            meta_midi = json.load(open(os.path.join(processed_data_dir, 'meta.json')))  # [list of dict]
            for song_item in meta_midi:
                item_name = raw_item_name = song_item['item_name']
                if len(self.processed_data_dirs) > 1:
                    item_name = f'ds{ds_id}_{item_name}'
                self.item2wavfn[item_name] = song_item['wav_fn']
                self.item2txt[item_name] = song_item['txt']
                self.item2ph[item_name] = ' '.join(song_item['phs'])
                self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP', '<SIL>'] else 0 for x in song_item['phs']]
                self.item2ph_durs[item_name] = song_item['ph_dur']
                self.item2midi[item_name] = song_item['notes']
                self.item2midi_dur[item_name] = song_item['notes_dur']
                self.item2is_slur[item_name] = song_item['is_slur']
                self.item2spk[item_name] = 'pop-cs'
                if len(self.processed_data_dirs) > 1:
                    self.item2spk[item_name] = f"ds{ds_id}_{self.item2spk[item_name]}"
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'f0'
        item_name = '/'.join(os.path.splitext(wav_fn)[0].split('/')[-2:]).replace('_wf0', '')
        res['pitch_midi'] = np.asarray(MidiSingingBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(MidiSingingBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(MidiSingingBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(MidiSingingBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (
            res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @staticmethod
    def get_align(ph_durs, mel, phone_encoded, res, hop_size=hparams['hop_size'], audio_sample_rate=hparams['audio_sample_rate']):
        mel2ph = np.zeros([mel.shape[0]], int)
        startTime = 0
        for i_ph in range(len(ph_durs)):
            start_frame = int(startTime * audio_sample_rate / hop_size + 0.5)
            end_frame = int((startTime + ph_durs[i_ph]) * audio_sample_rate / hop_size + 0.5)
            mel2ph[start_frame:end_frame] = i_ph + 1
            startTime = startTime + ph_durs[i_ph]
        # print('ph durs: ', ph_durs)
        # print('mel2ph: ', mel2ph, len(mel2ph))
        res['mel2ph'] = mel2ph
        # res['dur'] = None

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(MidiSingingBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


class ZhSingingBinarizer(ZhBinarizer, SingingBinarizer):
    pass


class OpencpopBinarizer(MidiSingingBinarizer):
    item2midi = {}
    item2midi_dur = {}
    item2is_slur = {}
    item2ph_durs = {}
    item2wdb = {}

    def split_train_test_set(self, item_names):
        item_names = deepcopy(item_names)
        test_item_names = [x for x in item_names if any([x.startswith(ts) for ts in hparams['test_prefixes']])]
        train_item_names = [x for x in item_names if x not in set(test_item_names)]
        logging.info("train {}".format(len(train_item_names)))
        logging.info("test {}".format(len(test_item_names)))
        return train_item_names, test_item_names

    def load_meta_data(self):
        raw_data_dir = hparams['raw_data_dir']
        # meta_midi = json.load(open(os.path.join(raw_data_dir, 'meta.json')))  # [list of dict]
        utterance_labels = open(os.path.join(raw_data_dir, 'transcriptions.txt')).readlines()
        for utterance_label in utterance_labels:
            song_info = utterance_label.split('|')
            item_name = raw_item_name = song_info[0]
            self.item2wavfn[item_name] = f'{raw_data_dir}/wavs/{item_name}.wav'
            self.item2txt[item_name] = song_info[1]
            self.item2ph[item_name] = song_info[2]
            # self.item2wdb[item_name] = list(np.nonzero([1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()])[0])
            self.item2wdb[item_name] = [1 if x in ALL_YUNMU + ['AP', 'SP'] else 0 for x in song_info[2].split()]
            self.item2ph_durs[item_name] = [float(x) for x in song_info[5].split(" ")]
            self.item2midi[item_name] = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
                                         for x in song_info[3].split(" ")]
            self.item2midi_dur[item_name] = [float(x) for x in song_info[4].split(" ")]
            self.item2is_slur[item_name] = [int(x) for x in song_info[6].split(" ")]
            self.item2spk[item_name] = 'opencpop'
        print('spkers: ', set(self.item2spk.values()))
        self.item_names = sorted(list(self.item2txt.keys()))
        if self.binarization_args['shuffle']:
            random.seed(1234)
            random.shuffle(self.item_names)
        self._train_item_names, self._test_item_names = self.split_train_test_set(self.item_names)

    @staticmethod
    def get_pitch(wav_fn, wav, spec, ph, res):
        wav_suffix = '.wav'
        # midi_suffix = '.mid'
        wav_dir = 'wavs'
        f0_dir = 'text_f0_align'
        item_name = os.path.splitext(os.path.basename(wav_fn))[0]
        res['pitch_midi'] = np.asarray(OpencpopBinarizer.item2midi[item_name])
        res['midi_dur'] = np.asarray(OpencpopBinarizer.item2midi_dur[item_name])
        res['is_slur'] = np.asarray(OpencpopBinarizer.item2is_slur[item_name])
        res['word_boundary'] = np.asarray(OpencpopBinarizer.item2wdb[item_name])
        assert res['pitch_midi'].shape == res['midi_dur'].shape == res['is_slur'].shape, (res['pitch_midi'].shape, res['midi_dur'].shape, res['is_slur'].shape)
        # gt f0.
        # f0 = None
        # f0_suffix = '_f0.npy'
        # f0fn = wav_fn.replace(wav_suffix, f0_suffix).replace(wav_dir, f0_dir)
        # pitch_info = np.load(f0fn)
        # f0 = [x[1] for x in pitch_info]
        # spec_x_coor = np.arange(0, 1, 1 / len(spec))[:len(spec)]
        #
        # f0_x_coor = np.arange(0, 1, 1 / len(f0))[:len(f0)]
        # f0 = interp1d(f0_x_coor, f0, 'nearest', fill_value='extrapolate')(spec_x_coor)[:len(spec)]
        # if sum(f0) == 0:
        #     raise BinarizationError("Empty **gt** f0")
        #
        # pitch_coarse = f0_to_coarse(f0)
        # res['f0'] = f0
        # res['pitch'] = pitch_coarse
        # gt f0.
        gt_f0, gt_pitch_coarse = get_pitch(wav, spec, hparams)
        if sum(gt_f0) == 0:
            raise BinarizationError("Empty **gt** f0")
        res['f0'] = gt_f0
        res['pitch'] = gt_pitch_coarse

    @classmethod
    def process_item(cls, item_name, ph, txt, tg_fn, wav_fn, spk_id, encoder, binarization_args):
        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(wav_fn)
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(wav_fn)
        res = {
            'item_name': item_name, 'txt': txt, 'ph': ph, 'mel': mel, 'wav': wav, 'wav_fn': wav_fn,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0], 'spk_id': spk_id
        }
        try:
            if binarization_args['with_f0']:
                cls.get_pitch(wav_fn, wav, mel, ph, res)
            if binarization_args['with_txt']:
                try:
                    phone_encoded = res['phone'] = encoder.encode(ph)
                except:
                    traceback.print_exc()
                    raise BinarizationError(f"Empty phoneme")
            if binarization_args['with_align']:
                cls.get_align(OpencpopBinarizer.item2ph_durs[item_name], mel, phone_encoded, res)
        except BinarizationError as e:
            print(f"| Skip item ({e}). item_name: {item_name}, wav_fn: {wav_fn}")
            return None
        return res


if __name__ == "__main__":
    SingingBinarizer().process()
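The get_align methods above convert per-phoneme durations (in seconds) into a frame-level mel2ph index with start_frame = int(t * sample_rate / hop_size + 0.5). A minimal standalone sketch of that arithmetic; the sample rate and hop size below are illustrative assumptions, not values taken from this commit's configs:

```python
import numpy as np


def durations_to_mel2ph(ph_durs, num_frames, sample_rate=24000, hop_size=128):
    """Map each mel frame to a 1-based phoneme index from phoneme durations (seconds).

    Illustrative re-implementation of the alignment arithmetic in get_align;
    sample_rate and hop_size are assumed values, not the repository's settings.
    """
    mel2ph = np.zeros(num_frames, dtype=int)
    start_time = 0.0
    for i_ph, dur in enumerate(ph_durs):
        start_frame = int(start_time * sample_rate / hop_size + 0.5)
        end_frame = int((start_time + dur) * sample_rate / hop_size + 0.5)
        mel2ph[start_frame:end_frame] = i_ph + 1  # 0 marks frames with no phoneme
        start_time += dur
    return mel2ph


print(durations_to_mel2ph([0.10, 0.25], num_frames=70))
```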


@@ -1,20 +0,0 @@
import os

os.environ["OMP_NUM_THREADS"] = "1"

import importlib
from utils.hparams import set_hparams, hparams


def binarize():
    binarizer_cls = hparams.get("binarizer_cls", 'data_gen.tts.base_binarizer.BaseBinarizer')
    pkg = ".".join(binarizer_cls.split(".")[:-1])
    cls_name = binarizer_cls.split(".")[-1]
    binarizer_cls = getattr(importlib.import_module(pkg), cls_name)
    print("| Binarizer: ", binarizer_cls)
    binarizer_cls().process()


if __name__ == '__main__':
    set_hparams()
    binarize()


@@ -1,17 +0,0 @@
from utils.hparams import set_hparams, hparams
import importlib


def preprocess():
    assert hparams['preprocess_cls'] != ''
    pkg = ".".join(hparams["preprocess_cls"].split(".")[:-1])
    cls_name = hparams["preprocess_cls"].split(".")[-1]
    process_cls = getattr(importlib.import_module(pkg), cls_name)
    process_cls().process()


if __name__ == '__main__':
    set_hparams()
    preprocess()


@@ -1,15 +0,0 @@
import subprocess
from utils.hparams import hparams, set_hparams
import os


def train_mfa_align():
    CORPUS = hparams['processed_data_dir'].split("/")[-1]
    print(f"| Run MFA for {CORPUS}.")
    NUM_JOB = int(os.getenv('N_PROC', os.cpu_count()))
    subprocess.check_call(f'CORPUS={CORPUS} NUM_JOB={NUM_JOB} bash data_gen/tts/scripts/run_mfa_train_align.sh', shell=True)


if __name__ == '__main__':
    set_hparams(print_hparams=False)
    train_mfa_align()


@@ -1,111 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
## DiffSinger (MIDI SVS | A version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
The pipeline below is designed for the Opencpop dataset:
### 1. Preparation
#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
# `data/binary/opencpop-midi-dp` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
#### Exp Name Preparation
```bash
export MY_FS_EXP_NAME=0302_opencpop_fs_midi
export MY_DS_EXP_NAME=0303_opencpop_ds58_midi
```
```
.
|--data
|--raw
|--opencpop
|--segments
|--transcriptions.txt
|--wavs
|--checkpoints
|--MY_FS_EXP_NAME (optional)
|--MY_DS_EXP_NAME (optional)
|--0109_hifigan_bigpopcs_hop128
|--model_ckpt_steps_1512000.ckpt
|--config.yaml
```
### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the pre-trained model, or train FFT-Singer from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml --exp_name $MY_FS_EXP_NAME --reset
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/midi/cascade/opencs/ds60_rel.yaml` to fit your path.
### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```
We also provide:
- the pre-trained model of DiffSinger;
- the pre-trained model of FFT-Singer;
They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).
Remember to put the pre-trained models in `checkpoints` directory.
### 4. Inference from raw inputs
```sh
python inference/svs/ds_cascade.py --config usr/configs/midi/cascade/opencs/ds60_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
'input_type': 'word'
} # user input: Chinese characters
or,
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
'input_type': 'phoneme'
} # input like Opencpop dataset.
```
### 5. Some issues.
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.
b) In this version of the code, we use the melody frontend ([lyric + MIDI]->[F0+ph_dur]) to predict the F0 contour and phoneme durations.
c) Generated audio demos can be found in [MY_DS_EXP_NAME](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/adjust-receptive-field.zip).


@@ -1,107 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
Substantial update: We 1) **abandon** the explicit prediction of the F0 curve; 2) increase the receptive field of the denoiser; 3) make the linguistic encoder more robust.
**By doing so, 1) the synthesized recordings are more natural in terms of pitch; 2) the pipeline is simpler.**
In short, the dynamics of the F0 curve are now left to the generative model to capture, instead of being constrained with an MSE loss on log-domain F0 as before.
## DiffSinger (MIDI SVS | B version)
### 0. Data Acquirement
For the Opencpop dataset: please strictly follow the instructions of [Opencpop](https://wenet.org.cn/opencpop/). We have no right to grant you access to Opencpop.
The pipeline below is designed for the Opencpop dataset:
### 1. Preparation
#### Data Preparation
a) Download and extract Opencpop, then create a link to the dataset folder: `ln -s /xxx/opencpop data/raw/`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/midi/cascade/opencs/aux_rel.yaml
# `data/binary/opencpop-midi-dp` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Also, please unzip the pre-trained vocoder and [this companion checkpoint for the vocoder](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0102_xiaoma_pe.zip) into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
#### Exp Name Preparation
```bash
export MY_DS_EXP_NAME=0228_opencpop_ds100_rel
```
```
.
|--data
|--raw
|--opencpop
|--segments
|--transcriptions.txt
|--wavs
|--checkpoints
|--MY_DS_EXP_NAME (optional)
|--0109_hifigan_bigpopcs_hop128 (vocoder)
|--model_ckpt_steps_1512000.ckpt
|--config.yaml
```
### 2. Training Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset
```
### 3. Inference from packed test set
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME --reset --infer
```
We also provide:
- the pre-trained model of DiffSinger;
They can be found in [here](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).
Remember to put the pre-trained models in `checkpoints` directory.
### 4. Inference from raw inputs
```sh
python inference/svs/ds_e2e.py --config usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml --exp_name $MY_DS_EXP_NAME
```
Raw inputs:
```
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
'input_type': 'word'
} # user input: Chinese characters
or,
inp = {
'text': '小酒窝长睫毛AP是你最美的记号',
'ph_seq': 'x iao j iu w o ch ang ang j ie ie m ao AP sh i n i z ui m ei d e j i h ao',
'note_seq': 'C#4/Db4 C#4/Db4 F#4/Gb4 F#4/Gb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 F#4/Gb4 F#4/Gb4 F#4/Gb4 C#4/Db4 C#4/Db4 C#4/Db4 rest C#4/Db4 C#4/Db4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 A#4/Bb4 A#4/Bb4 G#4/Ab4 G#4/Ab4 F4 F4 C#4/Db4 C#4/Db4',
'note_dur_seq': '0.407140 0.407140 0.376190 0.376190 0.242180 0.242180 0.509550 0.509550 0.183420 0.315400 0.315400 0.235020 0.361660 0.361660 0.223070 0.377270 0.377270 0.340550 0.340550 0.299620 0.299620 0.344510 0.344510 0.283770 0.283770 0.323390 0.323390 0.360340 0.360340',
'is_slur_seq': '0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
'input_type': 'phoneme'
} # input like Opencpop dataset.
```
### 5. Some issues.
a) HifiGAN-Singing is trained on our [vocoder dataset](https://dl.acm.org/doi/abs/10.1145/3474085.3475437) and the training set of [PopCS](https://arxiv.org/abs/2105.02446). Opencpop is an out-of-domain dataset (unseen speaker), which may degrade audio quality; we are considering fine-tuning this vocoder on the Opencpop training set.
b) In this version of the code, we use the melody frontend ([lyric + MIDI]->[ph_dur]) to predict phoneme durations; the F0 curve is predicted implicitly together with the mel-spectrogram.
c) Example [generated audio](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/demos_0221/DS/).
More generated audio demos can be found in [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0228_opencpop_ds100_rel.zip).


@@ -1,63 +0,0 @@
## DiffSinger (SVS version)
### 0. Data Acquirement
- See in [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
b) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip) which is specially designed for SVS with NSF mechanism.
Please unzip this file into `checkpoints` before training your acoustic model.
(Update: You can also move [a ckpt with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory)
This singing vocoder is trained on ~70 hours of singing data and can be viewed as a universal vocoder.
### 2. Training Example
First, you need a pre-trained FFT-Singer checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch by running:
```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
Remember to put the pre-trained models in `checkpoints` directory.
*Note that:*
- *The original PWG vocoder used in the paper has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.*
- *We assume the ground-truth F0 is given as the pitch information, following [1][2][3]. If you want to conduct experiments on MIDI data, you need an external F0 predictor (like [MIDI-A-version](README-SVS-opencpop-cascade.md)) or a joint prediction with spectrograms (like [MIDI-B-version](README-SVS-opencpop-e2e.md)).*
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.


@@ -1,76 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 SVS](https://huggingface.co/spaces/Silentlin/DiffSinger)
## DiffSinger (SVS)
### PART1. [Run DiffSinger on PopCS](README-SVS-popcs.md)
In PART1, we only focus on spectrum modeling (acoustic model) and assume the ground-truth (GT) F0 to be given as the pitch information following these papers [1][2][3]. If you want to conduct experiments with F0 prediction, please move to PART2.
Thus, the pipeline of this part can be summarized as:
```
[lyrics] -> [linguistic representation] (Frontend)
[linguistic representation] + [GT F0] + [GT phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [GT F0] -> [waveform] (Vocoder)
```
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
Click here for detailed instructions: [link](README-SVS-popcs.md).
### PART2. [Run DiffSinger on Opencpop](README-SVS-opencpop-cascade.md)
Thanks to the [Opencpop team](https://wenet.org.cn/opencpop/) for releasing their SVS dataset with MIDI labels on **Jan.20, 2022** (after we published our paper).
Since there are elaborately annotated MIDI labels, we are able to supplement the pipeline in PART 1 by adding a naive melody frontend.
#### 2.A
Thus, the pipeline of [2.A](README-SVS-opencpop-cascade.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation (with MIDI information)] + [predicted F0] + [predicted phoneme duration] (Melody frontend)
[linguistic representation] + [predicted F0] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```
Click here for detailed instructions: [link](README-SVS-opencpop-cascade.md).
#### 2.B
In 2.A, we find that predicting F0 explicitly in the melody frontend leads to many bad cases of unvoiced/voiced (uv/v) prediction. We therefore abandon the explicit prediction of the F0 curve in the melody frontend and predict it jointly with the spectrogram.
Thus, the pipeline of [2.B](README-SVS-opencpop-e2e.md) can be summarized as:
```
[lyrics] + [MIDI] -> [linguistic representation] + [predicted phoneme duration] (Melody frontend)
[linguistic representation (with MIDI information)] + [predicted phoneme duration] -> [mel-spectrogram] (Acoustic model)
[mel-spectrogram] -> [predicted F0] (Pitch extractor)
[mel-spectrogram] + [predicted F0] -> [waveform] (Vocoder)
```
Click here for detailed instructions: [link](README-SVS-opencpop-e2e.md).
### FAQ
Q1: Why do I need F0 in Vocoders?
A1: See vocoder parts in HiFiSinger, DiffSinger or SingGAN. This is a common practice now.
Q2: Why not run the MIDI version of SVS on the PopCS dataset? Or why not release MIDI labels for the PopCS dataset?
A2: Our laboratory has no funds to label the PopCS dataset, but there are funds for labeling another singing dataset, which is coming soon.
Q3: Why do I get "'HifiGAN' object has no attribute 'model'"?
A3: Please put the pretrained vocoders in your `checkpoints` directory.
Q4: How to check whether I use GT information or predicted information during inference from packed test set?
A4: Please see codes [here](https://github.com/MoonInTheRiver/DiffSinger/blob/55e2f46068af6e69940a9f8f02d306c24a940cab/tasks/tts/fs2.py#L343).
...


@@ -1,69 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [Interactive🤗 TTS](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
## DiffSpeech (TTS)
### 1. Preparation
#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
b) Download and unzip the [ground-truth durations](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar) extracted by [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
c) Run the following scripts to pack the dataset for training/inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
# `data/binary/ljspeech` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip this file into `checkpoints` before training your acoustic model.
### 2. Training Example
First, you need a pre-trained FastSpeech2 checkpoint. You can use the [pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch by running:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the individual pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
Remember to put the pre-trained models in `checkpoints` directory.
## Mel Visualization
Along the vertical axis, DiffSpeech: [0-80]; FastSpeech2: [80-160].
<table style="width:100%">
<tr>
<th>DiffSpeech vs. FastSpeech 2</th>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
</table>


@@ -1,212 +0,0 @@
# DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism
[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446)
[![GitHub Stars](https://img.shields.io/github/stars/MoonInTheRiver/DiffSinger?style=social)](https://github.com/MoonInTheRiver/DiffSinger)
[![downloads](https://img.shields.io/github/downloads/MoonInTheRiver/DiffSinger/total.svg)](https://github.com/MoonInTheRiver/DiffSinger/releases)
| [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-blue)](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
| [English README](../README.md)
This repository contains the official PyTorch implementation of DiffSpeech (for text-to-speech) and DiffSinger (for singing voice synthesis) proposed in our AAAI-2022 [paper](https://arxiv.org/abs/2105.02446).
<table style="width:100%">
<tr>
<th>DiffSinger/DiffSpeech training stage</th>
<th>DiffSinger/DiffSpeech inference stage</th>
</tr>
<tr>
<td><img src="resources/model_a.png" alt="Training" height="300"></td>
<td><img src="resources/model_b.png" alt="Inference" height="300"></td>
</tr>
</table>
:tada: :tada: :tada: **Some important updates**:
- Mar.2, 2022: [MIDI-new-version](README-SVS-opencpop-e2e.md): a major update :sparkles:
- Mar.1, 2022: [NeuralSVB](https://github.com/MoonInTheRiver/NeuralSVB), code for the singing voice beautification task, is open-sourced :sparkles: :sparkles: :sparkles:.
- Feb.13, 2022: [NATSpeech](https://github.com/NATSpeech/NATSpeech), an upgraded code framework that includes DiffSpeech and our NeurIPS-2021 work [PortaSpeech](https://openreview.net/forum?id=xmJsuh8xlq), is open-sourced! :sparkles: :sparkles: :sparkles:.
- Jan.29, 2022: Added support for the [MIDI-old-version](README-SVS-opencpop-cascade.md) singing voice synthesis system.
- Jan.13, 2022: Added support for singing voice synthesis and released the PopCS dataset.
- Dec.19, 2021: Added support for text-to-speech. [HuggingFace🤗 Demo](https://huggingface.co/spaces/NATSpeech/DiffSpeech)
:rocket: **News**:
- Feb.24, 2022: Our new work `NeuralSVB` was accepted by ACL-2022 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2202.13277). [Audio demos](https://neuralsvb.github.io).
- Dec.01, 2021: DiffSinger was accepted by AAAI-2022.
- Sep.29, 2021: Our new work `PortaSpeech: Portable and High-Quality Generative Text-to-Speech` was accepted by NeurIPS-2021 [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2109.15166).
- May.06, 2021: We submitted DiffSinger to Arxiv [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2105.02446).
## Install dependencies
```sh
conda create -n your_env_name python=3.8
source activate your_env_name
pip install -r requirements_2080.txt (GPU 2080Ti, CUDA 10.2)
or pip install -r requirements_3090.txt (GPU 3090, CUDA 11.4)
```
## DiffSpeech (TTS version)
### 1. Preparation
#### Data Preparation
a) Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/), then create a link to the dataset folder: `ln -s /xxx/LJSpeech-1.1/ data/raw/`
b) Download and extract [the alignments pre-computed with MFA](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/mfa_outputs.tar): `tar -xvf mfa_outputs.tar; mv mfa_outputs data/processed/ljspeech/`
c) Run the following scripts to pack the dataset; the packed binary files are used for training and inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config configs/tts/lj/fs2.yaml
# `data/binary/ljspeech` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of the [HifiGAN](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0414_hifi_lj_1.zip) vocoder.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.
### 2. Training Example
First, you need a pre-trained FastSpeech2 checkpoint. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip), or train FastSpeech2 from scratch with the following command:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config configs/tts/lj/fs2.yaml --exp_name fs2_lj_1 --reset
```
Then, to train DiffSpeech, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/lj_ds_beta6.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/lj_ds_beta6.yaml --exp_name lj_ds_beta6_1213 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip);
- the pre-trained model of [FastSpeech 2](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/fs2_lj_1.zip) for the shallow diffusion mechanism in DiffSpeech;
Remember to put the pre-trained models in the `checkpoints` directory.
## DiffSinger (SVS version)
### 0. Data Acquirement
- See the [apply_form](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md).
- Dataset [preview](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_preview.zip).
### 1. Preparation
#### Data Preparation
a) Download and extract PopCS, then create a link to the dataset folder: `ln -s /xxx/popcs/ data/processed/popcs`
b) Run the following scripts to pack the dataset; the packed binary files are used for training and inference.
```sh
export PYTHONPATH=.
CUDA_VISIBLE_DEVICES=0 python data_gen/tts/bin/binarize.py --config usr/configs/popcs_ds_beta6.yaml
# `data/binary/popcs-pmf0` will be generated.
```
#### Vocoder Preparation
We provide the pre-trained model of [HifiGAN-Singing](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/0109_hifigan_bigpopcs_hop128.zip), which is specially designed for SVS with the NSF mechanism.
Please unzip the vocoder files into `checkpoints` before training your acoustic model.
(Update: You can also move [a checkpoint with more training steps](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/model_ckpt_steps_1512000.ckpt) into this vocoder directory.)
This vocoder is trained on a relatively large dataset of about 70 hours of singing and can be viewed as a universal vocoder.
### 2. Training Example
First, you need a pre-trained FFT-Singer. You can use [our pre-trained model](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip), or train FFT-Singer from scratch with the following scripts:
```sh
# First, train fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset
# Then, infer fft-singer;
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_fs2.yaml --exp_name popcs_fs2_pmf0_1230 --reset --infer
```
Then, to train DiffSinger, run:
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset
```
Remember to adjust the "fs2_ckpt" parameter in `usr/configs/popcs_ds_beta6_offline.yaml` to fit your path.
### 3. Inference Example
```sh
CUDA_VISIBLE_DEVICES=0 python tasks/run.py --config usr/configs/popcs_ds_beta6_offline.yaml --exp_name popcs_ds_beta6_offline_pmf0_1230 --reset --infer
```
We also provide:
- the pre-trained model of [DiffSinger](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_ds_beta6_offline_pmf0_1230.zip);
- the pre-trained model of [FFT-Singer](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/popcs_fs2_pmf0_1230.zip) for the shallow diffusion mechanism in DiffSinger;
Remember to put the pre-trained models in the `checkpoints` directory.
*Note that:*
- *The PWG version vocoder used in our original paper has been put into commercial use, so we provide this HifiGAN version vocoder as a substitute.*
- *This paper assumes the ground-truth F0 is given for the experiments, as previous works [1][2][3] did; the focus is on spectrum modeling rather than F0 curve prediction. If you want to conduct experiments on MIDI data (predicting the F0 curve, explicitly or implicitly, from MIDI and lyrics), please see [MIDI-old-version](README-SVS-opencpop-cascade.md) or [MIDI-new-version](README-SVS-opencpop-e2e.md). Currently supported MIDI datasets: Opencpop.*
[1] Adversarially trained multi-singer sequence-to-sequence singing synthesizer. Interspeech 2020.
[2] SEQUENCE-TO-SEQUENCE SINGING SYNTHESIS USING THE FEED-FORWARD TRANSFORMER. ICASSP 2020.
[3] DeepSinger : Singing Voice Synthesis with Data Mined From the Web. KDD 2020.
## Tensorboard
```sh
tensorboard --logdir_spec exp_name
```
<table style="width:100%">
<tr>
<td><img src="resources/tfb.png" alt="Tensorboard" height="250"></td>
</tr>
</table>
## Mel Visualization
Along the vertical axis, DiffSpeech: [0-80]; FastSpeech2: [80-160].
<table style="width:100%">
<tr>
<th>DiffSpeech vs. FastSpeech 2</th>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-1.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
<tr>
<td><img src="resources/diffspeech-fs2-2.png" alt="DiffSpeech-vs-FastSpeech2" height="250"></td>
</tr>
</table>
## Audio Demos
Audio samples can be found on our [demo page](https://diffsinger.github.io/).
We also put some test-set audio samples generated by DiffSpeech+HifiGAN (marked [P]) and GTmel+HifiGAN (marked [G]) in [resources/demos_1213](../resources/demos_1213).
(corresponding to the pre-trained checkpoint [DiffSpeech](https://github.com/MoonInTheRiver/DiffSinger/releases/download/pretrain-model/lj_ds_beta6_1213.zip))
---
:rocket: :rocket: :rocket: **Update:**
Newly generated singing samples are in [resources/demos_0112](../resources/demos_0112).
## Citation
If this repository is useful for your research and work, please cite the following paper:
@article{liu2021diffsinger,
  title={Diffsinger: Singing voice synthesis via shallow diffusion mechanism},
  author={Liu, Jinglin and Li, Chengxi and Ren, Yi and Chen, Feiyang and Liu, Peng and Zhao, Zhou},
  journal={arXiv preprint arXiv:2105.02446},
  volume={2},
  year={2021}}
## Acknowledgements
Our codes are based on the following repositories:
* [denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch)
* [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning)
* [ParallelWaveGAN](https://github.com/kan-bayashi/ParallelWaveGAN)
* [HifiGAN](https://github.com/jik876/hifi-gan)
* [espnet](https://github.com/espnet/espnet)
* [DiffWave](https://github.com/lmnt-com/diffwave)


@@ -1 +0,0 @@
libsndfile1


@@ -1,118 +0,0 @@
absl-py==0.11.0
alignment==1.0.10
altgraph==0.17
appdirs==1.4.4
async-timeout==3.0.1
audioread==2.1.9
backcall==0.2.0
blinker==1.4
brotlipy==0.7.0
cachetools==4.2.0
certifi==2020.12.5
cffi==1.14.4
chardet==4.0.0
click==7.1.2
cycler==0.10.0
Cython==0.29.21
cytoolz==0.11.0
decorator==4.4.2
Distance==0.1.3
einops==0.3.0
et-xmlfile==1.0.1
fsspec==0.8.4
future==0.18.2
g2p-en==2.1.0
g2pM==0.1.2.5
google-auth==1.24.0
google-auth-oauthlib==0.4.2
grpcio==1.34.0
h5py==3.1.0
horology==1.1.0
httplib2==0.18.1
idna==2.10
imageio==2.9.0
inflect==5.0.2
ipdb==0.13.4
ipython==7.19.0
ipython-genutils==0.2.0
jdcal==1.4.1
jedi==0.17.2
jieba==0.42.1
jiwer==2.2.0
joblib==1.0.0
kiwisolver==1.3.1
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.3
matplotlib==3.3.3
miditoolkit==0.1.7
mido==1.2.9
music21==5.7.2
networkx==2.5
nltk==3.5
numba==0.48.0
numpy==1.19.4
oauth2client==4.1.3
oauthlib==3.1.0
olefile==0.46
packaging==20.7
pandas==1.2.0
parso==0.7.1
patsy==0.5.1
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.0.1
pooch==1.3.0
praat-parselmouth==0.3.3
prompt-toolkit==3.0.8
protobuf==3.13.0
ptyprocess==0.6.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.20
pycwt==0.3.0a22
Pygments==2.7.3
PyInstaller==3.6
PyJWT==1.7.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.39.0
PySocks==1.7.1
python-dateutil==2.8.1
python-Levenshtein==0.12.0
pytorch-lightning==0.7.1
pytz==2020.5
PyWavelets==1.1.1
pyworld==0.2.12
PyYAML==5.3.1
regex==2020.11.13
requests==2.25.1
requests-oauthlib==1.3.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
rsa==4.6
scikit-image==0.16.2
scikit-learn==0.22.2.post1
scipy==1.5.4
six==1.15.0
SoundFile==0.10.3.post1
stopit==1.1.1
tensorboard==2.4.0
tensorboard-plugin-wit==1.7.0
tensorboardX==2.1
TextGrid==1.5
threadpoolctl==2.1.0
toolz==0.11.1
torch==1.6.0
torchaudio==0.6.0
torchvision==0.7.0
tqdm==4.54.1
traitlets==5.0.5
typing==3.7.4.3
urllib3==1.26.2
uuid==1.30
wcwidth==0.2.5
webencodings==0.5.1
webrtcvad==2.0.10
Werkzeug==1.0.1
pretty-midi==0.2.9


@@ -1,76 +0,0 @@
absl-py==0.15.0
appdirs==1.4.4
audioread==2.1.9
beautifulsoup4==4.10.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.7
cycler==0.11.0
Cython==0.29.24
decorator==4.4.2
dlib==19.22.1
einops==0.3.2
future==0.18.2
g2p-en==2.1.0
google==3.0.0
grpcio==1.42.0
h5py==2.8.0
horology==1.2.0
idna==3.3
imageio==2.10.1
imageio-ffmpeg==0.4.5
importlib-metadata==4.8.1
joblib==1.1.0
kiwisolver==1.3.2
librosa==0.8.0
llvmlite==0.31.0
Markdown==3.3.4
matplotlib==3.4.3
miditoolkit==0.1.7
moviepy==1.0.3
numba==0.48.0
numpy==1.20.0
opencv-python==4.5.4.58
packaging==21.2
pandas==1.3.4
Pillow==8.4.0
pooch==1.5.2
praat-parselmouth==0.3.3
proglog==0.1.9
protobuf==3.19.1
pycparser==2.20
pycwt==0.3.0a22
pydub==0.25.1
pyloudnorm==0.1.0
pyparsing==2.4.7
pypinyin==0.43.0
python-dateutil==2.8.2
pytorch-lightning==0.7.1
pytorch-ssim==0.1
pytz==2021.3
pyworld==0.3.0
PyYAML==6.0
requests==2.26.0
resampy==0.2.2
Resemblyzer==0.1.1.dev0
scikit-image==0.16.2
scikit-learn==0.22
scipy==1.3.0
six==1.16.0
sklearn==0.0
SoundFile==0.10.3.post1
soupsieve==2.3
sympy==1.9
tensorboard==1.15.0
tensorboardX==2.4
test-tube==0.7.5
TextGrid==1.5
torch @ https://download.pytorch.org/whl/nightly/cu113/torch-1.10.0.dev20210907%2Bcu113-cp37-cp37m-linux_x86_64.whl
torchvision==0.9.1
tqdm==4.62.3
typing-extensions==3.10.0.2
urllib3==1.26.7
uuid==1.30
webrtcvad==2.0.10
Werkzeug==2.0.2
zipp==3.6.0