diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/speaker_encoder/utils/prepare_voxceleb.py
index 7bcbaf95..1901a21c 100644
--- a/TTS/speaker_encoder/utils/prepare_voxceleb.py
+++ b/TTS/speaker_encoder/utils/prepare_voxceleb.py
@@ -25,12 +25,11 @@ import subprocess
 import sys
 import zipfile
 
+import pandas
 import soundfile as sf
 import tensorflow as tf
 from absl import logging
 
-import pandas
-
 gfile = tf.compat.v1.gfile
 
 SUBSETS = {
diff --git a/TTS/tts/layers/glow_tts/monotonic_align/core.pyx b/TTS/tts/layers/glow_tts/monotonic_align/core.pyx
index 6aabccc4..091fcc3a 100644
--- a/TTS/tts/layers/glow_tts/monotonic_align/core.pyx
+++ b/TTS/tts/layers/glow_tts/monotonic_align/core.pyx
@@ -1,6 +1,8 @@
 import numpy as np
-cimport numpy as np
+
 cimport cython
+cimport numpy as np
+
 from cython.parallel import prange
 
 
diff --git a/notebooks/dataset_analysis/analyze.py b/notebooks/dataset_analysis/analyze.py
index 66d008cd..6c6bc582 100644
--- a/notebooks/dataset_analysis/analyze.py
+++ b/notebooks/dataset_analysis/analyze.py
@@ -6,13 +6,12 @@ import random
 from statistics import StatisticsError, mean, median, mode, stdev
 
 import matplotlib.pyplot as plt
-
 import seaborn as sns
 from text.cmudict import CMUDict
 
 
 def get_audio_seconds(frames):
-    return (frames*12.5)/1000
+    return (frames * 12.5) / 1000
 
 
 def append_data_statistics(meta_data):
@@ -29,9 +28,7 @@ def append_data_statistics(meta_data):
         median_audio_len = median(audio_len_list)
 
         try:
-            std = stdev(
-                d["audio_len"] for d in data
-            )
+            std = stdev(d["audio_len"] for d in data)
         except StatisticsError:
             std = 0
 
@@ -46,24 +43,22 @@ def process_meta_data(path):
     meta_data = {}
 
     # load meta data
-    with open(path, 'r') as f:
-        data = csv.reader(f, delimiter='|')
+    with open(path, "r") as f:
+        data = csv.reader(f, delimiter="|")
         for row in data:
             frames = int(row[2])
             utt = row[3]
             audio_len = get_audio_seconds(frames)
             char_count = len(utt)
             if not meta_data.get(char_count):
-                meta_data[char_count] = {
-                    "data": []
-                }
+                meta_data[char_count] = {"data": []}
             meta_data[char_count]["data"].append(
                 {
                     "utt": utt,
                     "frames": frames,
                     "audio_len": audio_len,
-                    "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3])
+                    "row": "{}|{}|{}|{}".format(row[0], row[1], row[2], row[3]),
                 }
             )
 
@@ -74,30 +69,30 @@ def get_data_points(meta_data):
     x = meta_data
-    y_avg = [meta_data[d]['mean'] for d in meta_data]
-    y_mode = [meta_data[d]['mode'] for d in meta_data]
-    y_median = [meta_data[d]['median'] for d in meta_data]
-    y_std = [meta_data[d]['std'] for d in meta_data]
-    y_num_samples = [len(meta_data[d]['data']) for d in meta_data]
+    y_avg = [meta_data[d]["mean"] for d in meta_data]
+    y_mode = [meta_data[d]["mode"] for d in meta_data]
+    y_median = [meta_data[d]["median"] for d in meta_data]
+    y_std = [meta_data[d]["std"] for d in meta_data]
+    y_num_samples = [len(meta_data[d]["data"]) for d in meta_data]
     return {
         "x": x,
         "y_avg": y_avg,
         "y_mode": y_mode,
         "y_median": y_median,
         "y_std": y_std,
-        "y_num_samples": y_num_samples
+        "y_num_samples": y_num_samples,
     }
 
 
 def save_training(file_path, meta_data):
     rows = []
     for char_cnt in meta_data:
-        data = meta_data[char_cnt]['data']
+        data = meta_data[char_cnt]["data"]
         for d in data:
-            rows.append(d['row'] + "\n")
+            rows.append(d["row"] + "\n")
     random.shuffle(rows)
-    with open(file_path, 'w+') as f:
+    with open(file_path, "w+") as f:
         for row in rows:
             f.write(row)
 
@@ -108,15 +103,15 @@ def plot(meta_data, save_path=None):
         save = True
 
     graph_data = get_data_points(meta_data)
-    x = graph_data['x']
-    y_avg = graph_data['y_avg']
-    y_std = graph_data['y_std']
-    y_mode = graph_data['y_mode']
-    y_median = graph_data['y_median']
-    y_num_samples = graph_data['y_num_samples']
+    x = graph_data["x"]
+    y_avg = graph_data["y_avg"]
+    y_std = graph_data["y_std"]
+    y_mode = graph_data["y_mode"]
+    y_median = graph_data["y_median"]
+    y_num_samples = graph_data["y_num_samples"]
 
     plt.figure()
-    plt.plot(x, y_avg, 'ro')
+    plt.plot(x, y_avg, "ro")
     plt.xlabel("character lengths", fontsize=30)
     plt.ylabel("avg seconds", fontsize=30)
     if save:
@@ -124,7 +119,7 @@ def plot(meta_data, save_path=None):
         plt.savefig(os.path.join(save_path, name))
 
     plt.figure()
-    plt.plot(x, y_mode, 'ro')
+    plt.plot(x, y_mode, "ro")
     plt.xlabel("character lengths", fontsize=30)
     plt.ylabel("mode seconds", fontsize=30)
     if save:
@@ -132,7 +127,7 @@ def plot(meta_data, save_path=None):
         plt.savefig(os.path.join(save_path, name))
 
     plt.figure()
-    plt.plot(x, y_median, 'ro')
+    plt.plot(x, y_median, "ro")
    plt.xlabel("character lengths", fontsize=30)
     plt.ylabel("median seconds", fontsize=30)
     if save:
@@ -140,7 +135,7 @@ def plot(meta_data, save_path=None):
         plt.savefig(os.path.join(save_path, name))
 
     plt.figure()
-    plt.plot(x, y_std, 'ro')
+    plt.plot(x, y_std, "ro")
     plt.xlabel("character lengths", fontsize=30)
     plt.ylabel("standard deviation", fontsize=30)
     if save:
@@ -148,7 +143,7 @@ def plot(meta_data, save_path=None):
         plt.savefig(os.path.join(save_path, name))
 
     plt.figure()
-    plt.plot(x, y_num_samples, 'ro')
+    plt.plot(x, y_num_samples, "ro")
     plt.xlabel("character lengths", fontsize=30)
     plt.ylabel("number of samples", fontsize=30)
     if save:
@@ -161,8 +156,8 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
 
     phonemes = {}
 
-    with open(train_path, 'r') as f:
-        data = csv.reader(f, delimiter='|')
+    with open(train_path, "r") as f:
+        data = csv.reader(f, delimiter="|")
         phonemes["None"] = 0
         for row in data:
             words = row[3].split()
@@ -194,15 +189,12 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--train_file_path', required=True,
-        help='this is the path to the train.txt file that the preprocess.py script creates'
-    )
-    parser.add_argument(
-        '--save_to', help='path to save charts of data to'
-    )
-    parser.add_argument(
-        '--cmu_dict_path', help='give cmudict-0.7b to see phoneme distribution'
+        "--train_file_path",
+        required=True,
+        help="this is the path to the train.txt file that the preprocess.py script creates",
     )
+    parser.add_argument("--save_to", help="path to save charts of data to")
+    parser.add_argument("--cmu_dict_path", help="give cmudict-0.7b to see phoneme distribution")
     args = parser.parse_args()
     meta_data = process_meta_data(args.train_file_path)
     plt.rcParams["figure.figsize"] = (10, 5)
@@ -213,5 +205,6 @@ def main():
         plt.show()
 
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/pyproject.toml b/pyproject.toml
index 335303d1..5c742966 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,3 +26,8 @@ exclude = '''
   # the root of the project
 )
 '''
+
+[tool.isort]
+line_length = 120
+profile = "black"
+multi_line_output = 3
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 42544666..c479da9b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,10 +18,12 @@ bokeh==1.4.0
 pysbd
 # pyworld
 soundfile
-nose==1.3.7
-cardboardlint==1.3.0
-pylint==2.5.3
 gdown
 umap-learn==0.4.6
 cython
-pyyaml
\ No newline at end of file
+pyyaml
+# quality and style
+nose
+black
+isort
+pylint==2.7.4
\ No newline at end of file
diff --git a/tests/test_audio.py b/tests/test_audio.py
index 75141730..8065383e 100644
--- a/tests/test_audio.py
+++ b/tests/test_audio.py
@@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 os.makedirs(OUT_PATH, exist_ok=True)
-conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
+conf = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
 
 
 # pylint: disable=protected-access
@@ -20,10 +20,10 @@ class TestAudio(unittest.TestCase):
         self.ap = AudioProcessor(**conf.audio)
 
     def test_audio_synthesis(self):
-        """ 1. load wav
-            2. set normalization parameters
-            3. extract mel-spec
-            4. invert to wav and save the output
+        """1. load wav
+        2. set normalization parameters
+        3. extract mel-spec
+        4. invert to wav and save the output
         """
         print(" > Sanity check for the process wav -> mel -> wav")
 
@@ -35,23 +35,24 @@ class TestAudio(unittest.TestCase):
             wav = self.ap.load_wav(WAV_FILE)
             mel = self.ap.melspectrogram(wav)
             wav_ = self.ap.inv_melspectrogram(mel)
-            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
-                .format(max_norm, signal_norm, symmetric_norm, clip_norm)
+            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav".format(
+                max_norm, signal_norm, symmetric_norm, clip_norm
+            )
             print(" | > Creating wav file at : ", file_name)
             self.ap.save_wav(wav_, OUT_PATH + file_name)
 
         # maxnorm = 1.0
-        _test(1., False, False, False)
-        _test(1., True, False, False)
-        _test(1., True, True, False)
-        _test(1., True, False, True)
-        _test(1., True, True, True)
+        _test(1.0, False, False, False)
+        _test(1.0, True, False, False)
+        _test(1.0, True, True, False)
+        _test(1.0, True, False, True)
+        _test(1.0, True, True, True)
         # maxnorm = 4.0
-        _test(4., False, False, False)
-        _test(4., True, False, False)
-        _test(4., True, True, False)
-        _test(4., True, False, True)
-        _test(4., True, True, True)
+        _test(4.0, False, False, False)
+        _test(4.0, True, False, False)
+        _test(4.0, True, True, False)
+        _test(4.0, True, False, True)
+        _test(4.0, True, True, True)
 
     def test_normalize(self):
         """Check normalization and denormalization for range values and consistency """
@@ -67,7 +68,9 @@ class TestAudio(unittest.TestCase):
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
         # check value range
         assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
@@ -81,8 +84,9 @@ class TestAudio(unittest.TestCase):
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
-
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
         # check value range
@@ -97,13 +101,14 @@ class TestAudio(unittest.TestCase):
         self.ap.clip_norm = False
         self.ap.max_norm = 4.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
-
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
         # check value range
         assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
-        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  #pylint: disable=invalid-unary-operand-type
+        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()  # pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
         x_ = self.ap.denormalize(x_norm)
@@ -114,13 +119,14 @@ class TestAudio(unittest.TestCase):
         self.ap.clip_norm = True
         self.ap.max_norm = 4.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
-
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
         # check value range
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
-        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
+        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
         assert x_norm.min() <= 0, x_norm.min()
         # check denorm.
         x_ = self.ap.denormalize(x_norm)
@@ -130,8 +136,9 @@ class TestAudio(unittest.TestCase):
         self.ap.symmetric_norm = False
         self.ap.max_norm = 1.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
-
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
 
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
@@ -143,22 +150,23 @@ class TestAudio(unittest.TestCase):
         self.ap.symmetric_norm = True
         self.ap.max_norm = 1.0
         x_norm = self.ap.normalize(x)
-        print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}")
-
+        print(
+            f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}"
+        )
         assert (x_old - x).sum() == 0
 
         assert x_norm.max() <= self.ap.max_norm, x_norm.max()
-        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  #pylint: disable=invalid-unary-operand-type
+        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()  # pylint: disable=invalid-unary-operand-type
         assert x_norm.min() < 0, x_norm.min()
         x_ = self.ap.denormalize(x_norm)
         assert (x - x_).sum() < 1e-3
 
     def test_scaler(self):
-        scaler_stats_path = os.path.join(get_tests_input_path(), 'scale_stats.npy')
-        conf.audio['stats_path'] = scaler_stats_path
-        conf.audio['preemphasis'] = 0.0
-        conf.audio['do_trim_silence'] = True
-        conf.audio['signal_norm'] = True
+        scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
+        conf.audio["stats_path"] = scaler_stats_path
+        conf.audio["preemphasis"] = 0.0
+        conf.audio["do_trim_silence"] = True
+        conf.audio["signal_norm"] = True
 
         ap = AudioProcessor(**conf.audio)
         mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
diff --git a/tests/test_feed_forward_layers.py b/tests/test_feed_forward_layers.py
index a19e808c..1db980a3 100644
--- a/tests/test_feed_forward_layers.py
+++ b/tests/test_feed_forward_layers.py
@@ -9,99 +9,99 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 def test_encoder():
     input_dummy = torch.rand(8, 14, 37).to(device)
-    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
+    input_lengths = torch.randint(31, 37, (8,)).long().to(device)
     input_lengths[-1] = 37
-    input_mask = torch.unsqueeze(
-        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
+    input_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
     # relative positional transformer encoder
-    layer = Encoder(out_channels=11,
-                    in_hidden_channels=14,
-                    encoder_type='relative_position_transformer',
-                    encoder_params={
-                        'hidden_channels_ffn': 768,
-                        'num_heads': 2,
-                        "kernel_size": 3,
-                        "dropout_p": 0.1,
-                        "num_layers": 6,
-                        "rel_attn_window_size": 4,
-                        "input_length": None
-                    }).to(device)
+    layer = Encoder(
+        out_channels=11,
+        in_hidden_channels=14,
+        encoder_type="relative_position_transformer",
+        encoder_params={
+            "hidden_channels_ffn": 768,
+            "num_heads": 2,
+            "kernel_size": 3,
+            "dropout_p": 0.1,
+            "num_layers": 6,
+            "rel_attn_window_size": 4,
+            "input_length": None,
+        },
+    ).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 11, 37]
     # residual conv bn encoder
-    layer = Encoder(out_channels=11,
-                    in_hidden_channels=14,
-                    encoder_type='residual_conv_bn',
-                    encoder_params={
-                        "kernel_size": 4,
-                        "dilations": 4 * [1, 2, 4] + [1],
-                        "num_conv_blocks": 2,
-                        "num_res_blocks": 13
-                    }).to(device)
+    layer = Encoder(
+        out_channels=11,
+        in_hidden_channels=14,
+        encoder_type="residual_conv_bn",
+        encoder_params={"kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13},
+    ).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 11, 37]
     # FFTransformer encoder
-    layer = Encoder(out_channels=14,
-                    in_hidden_channels=14,
-                    encoder_type='fftransformer',
-                    encoder_params={
-                        "hidden_channels_ffn": 31,
-                        "num_heads": 2,
-                        "num_layers": 2,
-                        "dropout_p": 0.1
-                    }).to(device)
+    layer = Encoder(
+        out_channels=14,
+        in_hidden_channels=14,
+        encoder_type="fftransformer",
+        encoder_params={"hidden_channels_ffn": 31, "num_heads": 2, "num_layers": 2, "dropout_p": 0.1},
+    ).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 14, 37]
 
 
 def test_decoder():
     input_dummy = torch.rand(8, 128, 37).to(device)
-    input_lengths = torch.randint(31, 37, (8, )).long().to(device)
+    input_lengths = torch.randint(31, 37, (8,)).long().to(device)
     input_lengths[-1] = 37
-    input_mask = torch.unsqueeze(
-        sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
+    input_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
     # residual bn conv decoder
     layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 11, 37]
     # transformer decoder
-    layer = Decoder(out_channels=11,
-                    in_hidden_channels=128,
-                    decoder_type='relative_position_transformer',
-                    decoder_params={
-                        'hidden_channels_ffn': 128,
-                        'num_heads': 2,
-                        "kernel_size": 3,
-                        "dropout_p": 0.1,
-                        "num_layers": 8,
-                        "rel_attn_window_size": 4,
-                        "input_length": None
-                    }).to(device)
+    layer = Decoder(
+        out_channels=11,
+        in_hidden_channels=128,
+        decoder_type="relative_position_transformer",
+        decoder_params={
+            "hidden_channels_ffn": 128,
+            "num_heads": 2,
+            "kernel_size": 3,
+            "dropout_p": 0.1,
+            "num_layers": 8,
+            "rel_attn_window_size": 4,
+            "input_length": None,
+        },
+    ).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 11, 37]
     # wavenet decoder
-    layer = Decoder(out_channels=11,
-                    in_hidden_channels=128,
-                    decoder_type='wavenet',
-                    decoder_params={
-                        "num_blocks": 12,
-                        "hidden_channels": 192,
-                        "kernel_size": 5,
-                        "dilation_rate": 1,
-                        "num_layers": 4,
-                        "dropout_p": 0.05
-                    }).to(device)
+    layer = Decoder(
+        out_channels=11,
+        in_hidden_channels=128,
+        decoder_type="wavenet",
+        decoder_params={
+            "num_blocks": 12,
+            "hidden_channels": 192,
+            "kernel_size": 5,
+            "dilation_rate": 1,
+            "num_layers": 4,
+            "dropout_p": 0.05,
+        },
+    ).to(device)
     output = layer(input_dummy, input_mask)
     # FFTransformer decoder
-    layer = Decoder(out_channels=11,
-                    in_hidden_channels=128,
-                    decoder_type='fftransformer',
-                    decoder_params={
-                        'hidden_channels_ffn': 31,
-                        'num_heads': 2,
-                        "dropout_p": 0.1,
-                        "num_layers": 2,
-                    }).to(device)
+    layer = Decoder(
+        out_channels=11,
+        in_hidden_channels=128,
+        decoder_type="fftransformer",
+        decoder_params={
+            "hidden_channels_ffn": 31,
+            "num_heads": 2,
+            "dropout_p": 0.1,
+            "num_layers": 2,
+        },
+    ).to(device)
     output = layer(input_dummy, input_mask)
     assert list(output.shape) == [8, 11, 37]
diff --git a/tests/test_glow_tts.py b/tests/test_glow_tts.py
index 66d594e2..8e699faf 100644
--- a/tests/test_glow_tts.py
+++ b/tests/test_glow_tts.py
@@ -11,13 +11,13 @@ from TTS.tts.models.glow_tts import GlowTTS
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
 
-#pylint: disable=unused-variable
+# pylint: disable=unused-variable
 
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
+c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
 
 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -32,11 +32,11 @@ class GlowTTSTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
         input_lengths[-1] = 128
-        mel_spec = torch.rand(8, c.audio['num_mels'], 30).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+        mel_spec = torch.rand(8, c.audio["num_mels"], 30).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
+        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         criterion = GlowTTSLoss()
 
@@ -47,27 +47,28 @@ class GlowTTSTrainTest(unittest.TestCase):
             hidden_channels_dec=48,
             hidden_channels_dp=32,
             out_channels=80,
-            encoder_type='rel_pos_transformer',
+            encoder_type="rel_pos_transformer",
             encoder_params={
-                'kernel_size': 3,
-                'dropout_p': 0.1,
-                'num_layers': 6,
-                'num_heads': 2,
-                'hidden_channels_ffn': 16,  # 4 times the hidden_channels
-                'input_length': None
+                "kernel_size": 3,
+                "dropout_p": 0.1,
+                "num_layers": 6,
+                "num_heads": 2,
+                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
+                "input_length": None,
             },
             use_encoder_prenet=True,
             num_flow_blocks_dec=12,
             kernel_size_dec=5,
             dilation_rate=1,
             num_block_layers=4,
-            dropout_p_dec=0.,
+            dropout_p_dec=0.0,
             num_speakers=0,
             c_in_channels=0,
             num_splits=4,
             num_squeeze=1,
             sigmoid_scale=False,
-            mean_only=False).to(device)
+            mean_only=False,
+        ).to(device)
 
         # reference model to compare model weights
         model_ref = GlowTTS(
@@ -76,38 +77,37 @@ class GlowTTSTrainTest(unittest.TestCase):
            hidden_channels_dec=48,
             hidden_channels_dp=32,
             out_channels=80,
-            encoder_type='rel_pos_transformer',
+            encoder_type="rel_pos_transformer",
             encoder_params={
-                'kernel_size': 3,
-                'dropout_p': 0.1,
-                'num_layers': 6,
-                'num_heads': 2,
-                'hidden_channels_ffn': 16,  # 4 times the hidden_channels
-                'input_length': None
+                "kernel_size": 3,
+                "dropout_p": 0.1,
+                "num_layers": 6,
+                "num_heads": 2,
+                "hidden_channels_ffn": 16,  # 4 times the hidden_channels
+                "input_length": None,
             },
             use_encoder_prenet=True,
             num_flow_blocks_dec=12,
             kernel_size_dec=5,
             dilation_rate=1,
             num_block_layers=4,
-            dropout_p_dec=0.,
+            dropout_p_dec=0.0,
             num_speakers=0,
             c_in_channels=0,
             num_splits=4,
             num_squeeze=1,
             sigmoid_scale=False,
-            mean_only=False).to(device)
+            mean_only=False,
+        ).to(device)
 
         model.train()
-        print(" > Num parameters for GlowTTS model:%s" %
-              (count_parameters(model)))
+        print(" > Num parameters for GlowTTS model:%s" % (count_parameters(model)))
 
         # pass the state to ref model
         model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
 
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
 
@@ -115,18 +115,17 @@ class GlowTTSTrainTest(unittest.TestCase):
         for _ in range(5):
             optimizer.zero_grad()
             z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, None)
-            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths,
-                                  o_dur_log, o_total_dur, input_lengths)
-            loss = loss_dict['loss']
+                input_dummy, input_lengths, mel_spec, mel_lengths, None
+            )
+            loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, input_lengths)
+            loss = loss_dict["loss"]
             loss.backward()
             optimizer.step()
 
         # check parameter changes
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
-            assert (param != param_ref).any(
-            ), "param {} with shape {} not updated!! \n{}\n{}".format(
-                count, param.shape, param, param_ref)
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
+            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
             count += 1
diff --git a/tests/test_layers.py b/tests/test_layers.py
index 9224c673..9b89e645 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -10,7 +10,7 @@ from TTS.tts.utils.generic_utils import sequence_mask
 
 
 class PrenetTests(unittest.TestCase):
-    def test_in_out(self):  #pylint: disable=no-self-use
+    def test_in_out(self):  # pylint: disable=no-self-use
         layer = Prenet(128, out_features=[256, 128])
         dummy_input = T.rand(4, 128)
 
@@ -22,7 +22,7 @@ class PrenetTests(unittest.TestCase):
 
 class CBHGTests(unittest.TestCase):
     def test_in_out(self):
-        #pylint: disable=attribute-defined-outside-init
+        # pylint: disable=attribute-defined-outside-init
         layer = self.cbhg = CBHG(
             128,
             K=8,
@@ -30,7 +30,8 @@ class CBHGTests(unittest.TestCase):
             conv_projections=[160, 128],
             highway_features=80,
             gru_features=80,
-            num_highways=4)
+            num_highways=4,
+        )
         # B x D x T
         dummy_input = T.rand(4, 128, 8)
 
@@ -53,26 +54,27 @@ class DecoderTests(unittest.TestCase):
             attn_norm="sigmoid",
             attn_K=5,
             attn_type="original",
-            prenet_type='original',
+            prenet_type="original",
             prenet_dropout=True,
             forward_attn=True,
             trans_agent=True,
             forward_attn_mask=True,
             location_attn=True,
-            separate_stopnet=True)
+            separate_stopnet=True,
+        )
         dummy_input = T.rand(4, 8, 256)
         dummy_memory = T.rand(4, 2, 80)
 
-        output, alignment, stop_tokens = layer(
-            dummy_input, dummy_memory, mask=None)
+        output, alignment, stop_tokens = layer(dummy_input, dummy_memory, mask=None)
 
         assert output.shape[0] == 4
         assert output.shape[1] == 80, "size not {}".format(output.shape[1])
         assert output.shape[2] == 2, "size not {}".format(output.shape[2])
         assert stop_tokens.shape[0] == 4
 
+
 class EncoderTests(unittest.TestCase):
-    def test_in_out(self):  #pylint: disable=no-self-use
+    def test_in_out(self):  # pylint: disable=no-self-use
         layer = Encoder(128)
         dummy_input = T.rand(4, 8, 128)
 
@@ -85,7 +87,7 @@ class EncoderTests(unittest.TestCase):
 
 
 class L1LossMaskedTests(unittest.TestCase):
-    def test_in_out(self):  #pylint: disable=no-self-use
+    def test_in_out(self):  # pylint: disable=no-self-use
         # test input == target
         layer = L1LossMasked(seq_len_norm=False)
         dummy_input = T.ones(4, 8, 128).float()
@@ -105,16 +107,14 @@ class L1LossMaskedTests(unittest.TestCase):
         dummy_input = T.ones(4, 8, 128).float()
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 1.0, "1.0 vs {}".format(output.item())
 
         dummy_input = T.rand(4, 8, 128).float()
         dummy_target = dummy_input.detach()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
 
@@ -138,22 +138,20 @@ class L1LossMaskedTests(unittest.TestCase):
         dummy_input = T.ones(4, 8, 128).float()
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item())
 
         dummy_input = T.rand(4, 8, 128).float()
         dummy_target = dummy_input.detach()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
 
 
 class SSIMLossTests(unittest.TestCase):
-    def test_in_out(self):  #pylint: disable=no-self-use
+    def test_in_out(self):  # pylint: disable=no-self-use
         # test input == target
         layer = SSIMLoss()
         dummy_input = T.ones(4, 8, 128).float()
@@ -173,16 +171,14 @@ class SSIMLossTests(unittest.TestCase):
         dummy_input = T.ones(4, 8, 128).float()
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())
 
         dummy_input = T.rand(4, 8, 128).float()
         dummy_target = dummy_input.detach()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
 
@@ -206,15 +202,13 @@ class SSIMLossTests(unittest.TestCase):
         dummy_input = T.ones(4, 8, 128).float()
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item())
 
         dummy_input = T.rand(4, 8, 128).float()
         dummy_target = dummy_input.detach()
         dummy_length = (T.arange(5, 9)).long()
-        mask = (
-            (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
diff --git a/tests/test_loader.py b/tests/test_loader.py
index e711cc03..6174865b 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -12,11 +12,11 @@ from TTS.tts.datasets.preprocess import ljspeech
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
 
-#pylint: disable=unused-variable
+# pylint: disable=unused-variable
 
 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)
 
-c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
+c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
 ok_ljspeech = os.path.exists(c.data_path)
 
 DATA_EXIST = True
@@ -33,25 +33,27 @@ class TestTTSDataset(unittest.TestCase):
         self.ap = AudioProcessor(**c.audio)
 
     def _create_dataloader(self, batch_size, r, bgs):
-        items = ljspeech(c.data_path, 'metadata.csv')
+        items = ljspeech(c.data_path, "metadata.csv")
         dataset = TTSDataset.MyDataset(
             r,
             c.text_cleaner,
             compute_linear_spec=True,
             ap=self.ap,
             meta_data=items,
-            tp=c.characters if 'characters' in c.keys() else None,
+            tp=c.characters if "characters" in c.keys() else None,
             batch_group_size=bgs,
             min_seq_len=c.min_seq_len,
             max_seq_len=float("inf"),
-            use_phonemes=False)
+            use_phonemes=False,
+        )
         dataloader = DataLoader(
             dataset,
             batch_size=batch_size,
             shuffle=False,
             collate_fn=dataset.collate_fn,
             drop_last=True,
-            num_workers=c.num_loader_workers)
+            num_workers=c.num_loader_workers,
+        )
         return dataloader, dataset
 
     def test_loader(self):
@@ -72,18 +74,17 @@ class TestTTSDataset(unittest.TestCase):
 
                 neg_values = text_input[text_input < 0]
                 check_count = len(neg_values)
-                assert check_count == 0, \
-                    " !! Negative values in text_input: {}".format(check_count)
+                assert check_count == 0, " !! Negative values in text_input: {}".format(check_count)
                 # TODO: more assertion here
                 assert isinstance(speaker_name[0], str)
                 assert linear_input.shape[0] == c.batch_size
                 assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
                 assert mel_input.shape[0] == c.batch_size
-                assert mel_input.shape[2] == c.audio['num_mels']
+                assert mel_input.shape[2] == c.audio["num_mels"]
                 # check normalization ranges
                 if self.ap.symmetric_norm:
                     assert mel_input.max() <= self.ap.max_norm
-                    assert mel_input.min() >= -self.ap.max_norm  #pylint: disable=invalid-unary-operand-type
+                    assert mel_input.min() >= -self.ap.max_norm  # pylint: disable=invalid-unary-operand-type
                     assert mel_input.min() < 0
                 else:
                     assert mel_input.max() <= self.ap.max_norm
@@ -134,7 +135,7 @@ class TestTTSDataset(unittest.TestCase):
 
                 # check mel_spec consistency
                 wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
-                mel = self.ap.melspectrogram(wav).astype('float32')
+                mel = self.ap.melspectrogram(wav).astype("float32")
                 mel = torch.FloatTensor(mel).contiguous()
                 mel_dl = mel_input[0]
                 # NOTE: Below needs to check == 0 but due to an unknown reason
@@ -145,15 +146,14 @@ class TestTTSDataset(unittest.TestCase):
                 # check mel-spec correctness
                 mel_spec = mel_input[0].cpu().numpy()
                 wav = self.ap.inv_melspectrogram(mel_spec.T)
-                self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
-                shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')
+                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
+                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")
 
                 # check linear-spec
                 linear_spec = linear_input[0].cpu().numpy()
                 wav = self.ap.inv_spectrogram(linear_spec.T)
-                self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
-                shutil.copy(item_idx[0],
-                            OUTPATH + '/linear_target_dataloader.wav')
+                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
+                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")
 
                 # check the last time step to be zero padded
                 assert linear_input[0, -1].sum() != 0
@@ -202,8 +202,8 @@ class TestTTSDataset(unittest.TestCase):
                 # check the second item in the batch
                 assert linear_input[1 - idx, -1].sum() == 0
                 assert mel_input[1 - idx, -1].sum() == 0
-                assert stop_target[1, mel_lengths[1]-1] == 1
-                assert stop_target[1, mel_lengths[1]:].sum() == 0
+                assert stop_target[1, mel_lengths[1] - 1] == 1
+                assert stop_target[1, mel_lengths[1] :].sum() == 0
                 assert len(mel_lengths.shape) == 1
 
                 # check batch zero-frame conditions (zero-frame disabled)
diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py
index c120018d..968e2a29 100644
--- a/tests/test_preprocessors.py
+++ b/tests/test_preprocessors.py
@@ -6,12 +6,11 @@ from TTS.tts.datasets.preprocess import common_voice
 
 
 class TestPreprocessors(unittest.TestCase):
-
-    def test_common_voice_preprocessor(self):  #pylint: disable=no-self-use
+    def test_common_voice_preprocessor(self):  # pylint: disable=no-self-use
         root_path = get_tests_input_path()
         meta_file = "common_voice.tsv"
         items = common_voice(root_path, meta_file)
-        assert items[0][0] == 'The applicants are invited for coffee and visa is given immediately.'
+        assert items[0][0] == "The applicants are invited for coffee and visa is given immediately."
         assert items[0][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav")
 
         assert items[-1][0] == "Competition for limited resources has also resulted in some local conflicts."
diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py
index 77f3b54c..32ba2924 100644
--- a/tests/test_speaker_encoder.py
+++ b/tests/test_speaker_encoder.py
@@ -17,9 +17,7 @@ class SpeakerEncoderTests(unittest.TestCase):
     def test_in_out(self):
         dummy_input = T.rand(4, 20, 80)  # B x T x D
         dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
-        model = SpeakerEncoder(
-            input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3
-        )
+        model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
         # computing d vectors
         output = model.forward(dummy_input)
         assert output.shape[0] == 4
@@ -36,9 +34,7 @@ class SpeakerEncoderTests(unittest.TestCase):
         output_norm = T.nn.functional.normalize(output, dim=1, p=2)
         assert_diff = (output_norm - output).sum().item()
         assert output.type() == "torch.FloatTensor"
-        assert (
-            abs(assert_diff) < 1e-4
-        ), f" [!] output_norm has wrong values - {assert_diff}"
+        assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
         # compute d for a given batch
         dummy_input = T.rand(1, 240, 80)  # B x T x D
         output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
@@ -74,6 +70,7 @@ class GE2ELossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
+
 class AngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -103,6 +100,7 @@ class AngleProtoLossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
+
 # class LoaderTest(unittest.TestCase):
 #     def test_output(self):
 #         items = libri_tts("/home/erogol/Data/Libri-TTS/train-clean-360/")
diff --git a/tests/test_speedy_speech_layers.py b/tests/test_speedy_speech_layers.py
index 51a2450a..3473769b 100644
--- a/tests/test_speedy_speech_layers.py
+++ b/tests/test_speedy_speech_layers.py
@@ -10,11 +10,10 @@ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 def test_duration_predictor():
     input_dummy = torch.rand(8, 128, 27).to(device)
-    input_lengths = torch.randint(20, 27, (8, )).long().to(device)
+    input_lengths = torch.randint(20, 27, (8,)).long().to(device)
     input_lengths[-1] = 27
 
-    x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)),
-                             1).to(device)
+    x_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)
 
     layer = DurationPredictor(hidden_channels=128).to(device)
 
@@ -29,7 +28,7 @@ def test_speedy_speech():
     T_de = 74
 
     x_dummy = torch.randint(0, 7, (B, T_en)).long().to(device)
-    x_lengths = torch.randint(31, T_en, (B, )).long().to(device)
+    x_lengths = torch.randint(31, T_en, (B,)).long().to(device)
     x_lengths[-1] = T_en
 
     # set durations. max total duration should be equal to T_de
@@ -53,34 +52,18 @@ def test_speedy_speech():
     assert list(o_dr.shape) == [B, T_en]
 
     # with speaker embedding
-    model = SpeedySpeech(num_chars,
-                         out_channels=80,
-                         hidden_channels=128,
-                         num_speakers=10,
-                         c_in_channels=256).to(device)
-    model.forward(x_dummy,
-                  x_lengths,
-                  y_lengths,
-                  durations,
-                  g=torch.randint(0, 10, (B,)).to(device))
+    model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
+    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.randint(0, 10, (B,)).to(device))
 
     assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
     assert list(o_dr.shape) == [B, T_en]
 
-    # with speaker external embedding
-    model = SpeedySpeech(num_chars,
-                         out_channels=80,
-                         hidden_channels=128,
-                         num_speakers=10,
-                         external_c=True,
-                         c_in_channels=256).to(device)
-    model.forward(x_dummy,
-                  x_lengths,
-                  y_lengths,
-                  durations,
-                  g=torch.rand((B, 256)).to(device))
+    model = SpeedySpeech(
+        num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
+    ).to(device)
+    model.forward(x_dummy, x_lengths, y_lengths, durations, g=torch.rand((B, 256)).to(device))
 
     assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
diff --git a/tests/test_symbols.py b/tests/test_symbols.py
index 0c24f124..49b25986 100644
--- a/tests/test_symbols.py
+++ b/tests/test_symbols.py
@@ -4,5 +4,5 @@ from TTS.tts.utils.text import phonemes
 
 
 class SymbolsTest(unittest.TestCase):
-    def test_uniqueness(self):  #pylint: disable=no-self-use
+    def test_uniqueness(self):  # pylint: disable=no-self-use
         assert sorted(phonemes) == sorted(list(set(phonemes))), " {} vs {} ".format(len(phonemes), len(set(phonemes)))
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index 1c2c23b2..46b9ab74 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -14,8 +14,8 @@ class SynthesizerTest(unittest.TestCase):
     def _create_random_model(self):
         # pylint: disable=global-statement
         global symbols, phonemes
-        config = load_config(os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
-        if 'characters' in config.keys():
+        config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
+        if "characters" in config.keys():
             symbols, phonemes = make_symbols(**config.characters)
 
         num_chars = len(phonemes) if config.use_phonemes else len(symbols)
@@ -25,11 +25,11 @@ class SynthesizerTest(unittest.TestCase):
 
     def test_in_out(self):
         self._create_random_model()
-        config = load_config(os.path.join(get_tests_input_path(), 'server_config.json'))
+        config = load_config(os.path.join(get_tests_input_path(), "server_config.json"))
         tts_root_path = get_tests_output_path()
-        config['tts_checkpoint'] = os.path.join(tts_root_path, config['tts_checkpoint'])
-        config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
-        synthesizer = Synthesizer(config['tts_checkpoint'], config['tts_config'], None, None)
+        config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"])
+        config["tts_config"] = os.path.join(tts_root_path, config["tts_config"])
+        synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None)
         synthesizer.tts("Better this test works!!")
 
     def test_split_into_sentences(self):
@@ -38,20 +38,48 @@ class SynthesizerTest(unittest.TestCase):
         # pylint: disable=attribute-defined-outside-init
         self.seg = Synthesizer.get_segmenter("en")
         sis = Synthesizer.split_into_sentences
-        assert sis(self, 'Hello. Two sentences') == ['Hello.', 'Two sentences']
-        assert sis(self, 'He went to meet the adviser from Scott, Waltman & Co. next morning.') == ['He went to meet the adviser from Scott, Waltman & Co. next morning.']
-        assert sis(self, 'Let\'s run it past Sarah and co. They\'ll want to see this.') == ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.']
-        assert sis(self, 'Where is Bobby Jr.\'s rabbit?') == ['Where is Bobby Jr.\'s rabbit?']
-        assert sis(self, 'Please inform the U.K. authorities right away.') == ['Please inform the U.K. authorities right away.']
-        assert sis(self, 'Were David and co. at the event?') == ['Were David and co. at the event?']
-        assert sis(self, 'paging dr. green, please come to theatre four immediately.') == ['paging dr. green, please come to theatre four immediately.']
-        assert sis(self, 'The email format is Firstname.Lastname@example.com. I think you reversed them.') == ['The email format is Firstname.Lastname@example.com.', 'I think you reversed them.']
-        assert sis(self, 'The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.') == ['The demo site is: https://top100.example.com/subsection/latestnews.html.', 'Please send us your feedback.']
-        assert sis(self, 'Scowling at him, \'You are not done yet!\' she yelled.') == ['Scowling at him, \'You are not done yet!\' she yelled.']  # with the final lowercase "she" we see it's all one sentence
-        assert sis(self, 'Hey!! So good to see you.') == ['Hey!!', 'So good to see you.']
-        assert sis(self, 'He went to Yahoo! but I don\'t know the division.') == ['He went to Yahoo! but I don\'t know the division.']
-        assert sis(self, 'If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."') == ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."']
-        assert sis(self, 'The address is not google.com.') == ['The address is not google.com.']
-        assert sis(self, '1.) The first item 2.) The second item') == ['1.) The first item', '2.) The second item']
-        assert sis(self, '1) The first item 2) The second item') == ['1) The first item', '2) The second item']
-        assert sis(self, 'a. The first item b. The second item c. The third list item') == ['a. The first item', 'b. The second item', 'c. The third list item']
+        assert sis(self, "Hello. Two sentences") == ["Hello.", "Two sentences"]
+        assert sis(self, "He went to meet the adviser from Scott, Waltman & Co. next morning.") == [
+            "He went to meet the adviser from Scott, Waltman & Co. next morning."
+        ]
+        assert sis(self, "Let's run it past Sarah and co. They'll want to see this.") == [
+            "Let's run it past Sarah and co.",
+            "They'll want to see this.",
+        ]
+        assert sis(self, "Where is Bobby Jr.'s rabbit?") == ["Where is Bobby Jr.'s rabbit?"]
+        assert sis(self, "Please inform the U.K. authorities right away.") == [
+            "Please inform the U.K. authorities right away."
+        ]
+        assert sis(self, "Were David and co. at the event?") == ["Were David and co. at the event?"]
+        assert sis(self, "paging dr. green, please come to theatre four immediately.") == [
+            "paging dr. green, please come to theatre four immediately."
+        ]
+        assert sis(self, "The email format is Firstname.Lastname@example.com. I think you reversed them.") == [
+            "The email format is Firstname.Lastname@example.com.",
+            "I think you reversed them.",
+        ]
+        assert sis(
+            self,
+            "The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.",
+        ) == [
+            "The demo site is: https://top100.example.com/subsection/latestnews.html.",
+            "Please send us your feedback.",
+        ]
+        assert sis(self, "Scowling at him, 'You are not done yet!' she yelled.") == [
+            "Scowling at him, 'You are not done yet!' she yelled."
+        ]  # with the final lowercase "she" we see it's all one sentence
+        assert sis(self, "Hey!! So good to see you.") == ["Hey!!", "So good to see you."]
+        assert sis(self, "He went to Yahoo! but I don't know the division.") == [
+            "He went to Yahoo! but I don't know the division."
+        ]
+        assert sis(self, "If you can't remember a quote, “at least make up a memorable one that's plausible...\"") == [
+            "If you can't remember a quote, “at least make up a memorable one that's plausible...\""
+        ]
+        assert sis(self, "The address is not google.com.") == ["The address is not google.com."]
+        assert sis(self, "1.) The first item 2.) The second item") == ["1.) The first item", "2.) The second item"]
+        assert sis(self, "1) The first item 2) The second item") == ["1) The first item", "2) The second item"]
+        assert sis(self, "a. The first item b. The second item c. The third list item") == [
+            "a. The first item",
+            "b. The second item",
+            "c. The third list item",
+        ]
diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py
index fb811eaa..0e35605f 100644
--- a/tests/test_tacotron2_model.py
+++ b/tests/test_tacotron2_model.py
@@ -11,13 +11,13 @@ from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
 
-#pylint: disable=unused-variable
+# pylint: disable=unused-variable
 
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
+c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
 
 ap = AudioProcessor(**c.audio)
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -26,20 +26,19 @@ WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 class TacotronTrainTest(unittest.TestCase):
     def test_train_step(self):  # pylint: disable=no-self-use
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
@@ -48,14 +47,14 @@ class TacotronTrainTest(unittest.TestCase):
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            )
             assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -66,13 +65,12 @@ class TacotronTrainTest(unittest.TestCase):
             optimizer.step()
         # check parameter changes
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             # ignore pre-highway layer since it works conditional
             # if count not in [145, 59]:
-            assert (param != param_ref).any(
-            ), "param {} with shape {} not updated!! \n{}\n{}".format(
-                count, param.shape, param, param_ref)
+            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
             count += 1
 
 
@@ -80,20 +78,19 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
         speaker_embeddings = torch.rand(8, 55).to(device)
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
@@ -102,14 +99,14 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            )
             assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -120,39 +117,46 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
             optimizer.step()
         # check parameter changes
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             # ignore pre-highway layer since it works conditional
             # if count not in [145, 59]:
-            assert (param != param_ref).any(
-            ), "param {} with shape {} not updated!! \n{}\n{}".format(
-                count, param.shape, param, param_ref)
+            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
             count += 1
 
+
 class TacotronGSTTrainTest(unittest.TestCase):
-    #pylint: disable=no-self-use
+    # pylint: disable=no-self-use
     def test_train_step(self):
         # with random gst mel style
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
+        model = Tacotron2(
+            num_chars=24,
+            r=c.r,
+            num_speakers=5,
+            gst=True,
+            gst_embedding_dim=c.gst["gst_embedding_dim"],
+            gst_num_heads=c.gst["gst_num_heads"],
+            gst_style_tokens=c.gst["gst_style_tokens"],
+        ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
@@ -162,7 +166,8 @@
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            )
             assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -177,36 +182,45 @@
             # ignore pre-highway layer since it works conditional
             # if count not in [145, 59]:
             name, param = name_param
-            if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
-                #print(param.grad)
+            if name == "gst_layer.encoder.recurrence.weight_hh_l0":
+                # print(param.grad)
                 continue
-            assert (param != param_ref).any(
-            ), "param {} {} with shape {} not updated!! \n{}\n{}".format(
-                name, count, param.shape, param, param_ref)
+            assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format(
+                name, count, param.shape, param, param_ref
+            )
             count += 1
 
         # with file gst style
-        mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device)
+        mel_spec = (
+            torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :30].unsqueeze(0).transpose(1, 2).to(device)
+        )
         mel_spec = mel_spec.repeat(8, 1, 1)
 
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens']).to(device)
+        model = Tacotron2(
+            num_chars=24,
+            r=c.r,
+            num_speakers=5,
+            gst=True,
+            gst_embedding_dim=c.gst["gst_embedding_dim"],
+            gst_num_heads=c.gst["gst_num_heads"],
+            gst_style_tokens=c.gst["gst_style_tokens"],
+        ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
@@ -216,7 +230,8 @@
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(10):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids
+            )
             assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -231,47 +246,57 @@
             # ignore pre-highway layer since it works conditional
             # if count not in [145, 59]:
             name, param = name_param
-            if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
-                #print(param.grad)
+            if name == "gst_layer.encoder.recurrence.weight_hh_l0":
+                # print(param.grad)
                 continue
-            assert (param != param_ref).any(
-            ), "param {} {} with shape {} not updated!! \n{}\n{}".format(
-                name, count, param.shape, param, param_ref)
+            assert (param != param_ref).any(), "param {} {} with shape {} not updated!! \n{}\n{}".format(
+                name, count, param.shape, param, param_ref
+            )
             count += 1
 
+
 class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
     @staticmethod
     def test_train_step():
         input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        input_lengths = torch.randint(100, 128, (8,)).long().to(device)
         input_lengths = torch.sort(input_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         mel_lengths[0] = 30
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
         speaker_embeddings = torch.rand(8, 55).to(device)
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(input_dummy.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         criterion = MSELossMasked(seq_len_norm=False).to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
-        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, gst=True, gst_embedding_dim=c.gst['gst_embedding_dim'], gst_num_heads=c.gst['gst_num_heads'], gst_style_tokens=c.gst['gst_style_tokens'], gst_use_speaker_embedding=c.gst['gst_use_speaker_embedding']).to(device)
+        model = Tacotron2(
+            num_chars=24,
+            r=c.r,
+            num_speakers=5,
+            speaker_embedding_dim=55,
+            gst=True,
+            gst_embedding_dim=c.gst["gst_embedding_dim"],
+            gst_num_heads=c.gst["gst_num_heads"],
+            gst_style_tokens=c.gst["gst_style_tokens"],
+            gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
+        ).to(device)
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
-        for param, param_ref in zip(model.parameters(),
-                                    model_ref.parameters()):
+        for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             assert (param - param_ref).sum() == 0, param
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
             mel_out, mel_postnet_out, align, stop_tokens = model.forward(
-                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings)
+                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings
+            )
            assert torch.sigmoid(stop_tokens).data.max() <= 1.0
             assert torch.sigmoid(stop_tokens).data.min() >= 0.0
             optimizer.zero_grad()
@@ -282,14 +307,13 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
             optimizer.step()
         # check parameter changes
         count = 0
-        for name_param, param_ref in zip(model.named_parameters(),
-                                         model_ref.parameters()):
+        for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()):
             # ignore pre-highway layer since it works conditional
             # if count not in [145, 59]:
             name, param = name_param
-            if name == 'gst_layer.encoder.recurrence.weight_hh_l0':
+            if name == "gst_layer.encoder.recurrence.weight_hh_l0":
                 continue
-            assert (param != param_ref).any(
-            ), "param {} with shape {} not updated!! \n{}\n{}".format(
-                count, param.shape, param, param_ref)
+            assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
+                count, param.shape, param, param_ref
+            )
             count += 1
diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py
index 084b972d..767e5ffc 100644
--- a/tests/test_tacotron2_tf_model.py
+++ b/tests/test_tacotron2_tf_model.py
@@ -10,48 +10,51 @@ from TTS.tts.tf.models.tacotron2 import Tacotron2
 from TTS.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
 from TTS.utils.io import load_config
 
-tf.get_logger().setLevel('INFO')
+tf.get_logger().setLevel("INFO")
 
-
-#pylint: disable=unused-variable
+# pylint: disable=unused-variable
 
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
+c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
 
 
 class TacotronTFTrainTest(unittest.TestCase):
-
     @staticmethod
     def generate_dummy_inputs():
         chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
-        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        chars_seq_lengths = torch.randint(100, 128, (8,)).long().to(device)
         chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
-        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
-        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
+        mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_postnet_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device)
+        mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
-        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
+        speaker_ids = torch.randint(0, 5, (8,)).long().to(device)
 
         chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
         chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
         mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
-        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids
+        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths, stop_targets, speaker_ids
 
     def test_train_step(self):
-        ''' test forward pass '''
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
+        """ test forward pass """
+        (
+            chars_seq,
+            chars_seq_lengths,
+            mel_spec,
+            mel_postnet_spec,
+            mel_lengths,
+            stop_targets,
+            speaker_ids,
+        ) = self.generate_dummy_inputs()
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+            stop_targets[:, int(idx.item()) :, 0] = 1.0
 
-        stop_targets = stop_targets.view(chars_seq.shape[0],
-                                         stop_targets.size(1) // c.r, -1)
+        stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
 
         model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
@@ -68,15 +71,23 @@ class TacotronTFTrainTest(unittest.TestCase):
         # inference pass
         output = model(chars_seq, training=False)
 
-    def test_forward_attention(self,):
-        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
-            stop_targets, speaker_ids = self.generate_dummy_inputs()
+    def test_forward_attention(
+        self,
+    ):
+        (
+            chars_seq,
+            chars_seq_lengths,
+            mel_spec,
+            mel_postnet_spec,
+            mel_lengths,
+            stop_targets,
+            speaker_ids,
+        ) = self.generate_dummy_inputs()
 
         for idx in mel_lengths:
-            stop_targets[:, int(idx.item()):, 0] = 1.0
+ stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(chars_seq.shape[0], - stop_targets.size(1) // c.r, -1) + stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True) @@ -93,45 +104,51 @@ class TacotronTFTrainTest(unittest.TestCase): # inference pass output = model(chars_seq, training=False) - def test_tflite_conversion(self, ): #pylint:disable=no-self-use - model = Tacotron2(num_chars=24, - num_speakers=0, - r=3, - postnet_output_dim=80, - decoder_output_dim=80, - attn_type='original', - attn_win=False, - attn_norm='sigmoid', - prenet_type='original', - prenet_dropout=True, - forward_attn=False, - trans_agent=False, - forward_attn_mask=False, - location_attn=True, - attn_K=0, - separate_stopnet=True, - bidirectional_decoder=False, - enable_tflite=True) + def test_tflite_conversion( + self, + ): # pylint:disable=no-self-use + model = Tacotron2( + num_chars=24, + num_speakers=0, + r=3, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type="original", + attn_win=False, + attn_norm="sigmoid", + prenet_type="original", + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + attn_K=0, + separate_stopnet=True, + bidirectional_decoder=False, + enable_tflite=True, + ) model.build_inference() - convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True) + convert_tacotron2_to_tflite(model, output_path="test_tacotron2.tflite", experimental_converter=True) # init tflite model - tflite_model = load_tflite_model('test_tacotron2.tflite') + tflite_model = load_tflite_model("test_tacotron2.tflite") # fake input - inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) #pylint:disable=unexpected-keyword-arg + inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32) # pylint:disable=unexpected-keyword-arg # run inference # get input and output details input_details = tflite_model.get_input_details() output_details = tflite_model.get_output_details() # reshape input tensor for the new input shape - tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape) #pylint:disable=unexpected-keyword-arg + tflite_model.resize_tensor_input( + input_details[0]["index"], inputs.shape + ) # pylint:disable=unexpected-keyword-arg tflite_model.allocate_tensors() detail = input_details[0] - input_shape = detail['shape'] - tflite_model.set_tensor(detail['index'], inputs) + input_shape = detail["shape"] + tflite_model.set_tensor(detail["index"], inputs) # run the tflite_model tflite_model.invoke() # collect outputs - decoder_output = tflite_model.get_tensor(output_details[0]['index']) - postnet_output = tflite_model.get_tensor(output_details[1]['index']) + decoder_output = tflite_model.get_tensor(output_details[0]["index"]) + postnet_output = tflite_model.get_tensor(output_details[1]["index"]) # remove tflite binary - os.remove('test_tacotron2.tflite') + os.remove("test_tacotron2.tflite") diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index 0af8dab4..e3ed8ae2 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -11,13 +11,13 @@ from TTS.tts.models.tacotron import Tacotron from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_config -#pylint: disable=unused-variable +# pylint: disable=unused-variable torch.manual_seed(1) 
use_cuda = torch.cuda.is_available() device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") -c = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +c = load_config(os.path.join(get_tests_input_path(), "test_config.json")) ap = AudioProcessor(**c.audio) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") @@ -32,147 +32,140 @@ class TacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_lengths = torch.randint(20, 30, (8,)).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + speaker_ids = torch.randint(0, 5, (8,)).long().to(device) for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 + stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, num_speakers=5, - postnet_output_dim=c.audio['fft_size'], - decoder_output_dim=c.audio['num_mels'], + postnet_output_dim=c.audio["fft_size"], + decoder_output_dim=c.audio["num_mels"], r=c.r, - memory_size=c.memory_size - ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + memory_size=c.memory_size, + ).to( + device + ) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % - (count_parameters(model))) + print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + ) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, - mel_lengths) + stop_loss + loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any( - ), "param {} with shape {} 
not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1 + class MultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_lengths = torch.randint(20, 30, (8,)).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_embeddings = torch.rand(8, 55).to(device) for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 + stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, num_speakers=5, - postnet_output_dim=c.audio['fft_size'], - decoder_output_dim=c.audio['num_mels'], + postnet_output_dim=c.audio["fft_size"], + decoder_output_dim=c.audio["num_mels"], r=c.r, memory_size=c.memory_size, speaker_embedding_dim=55, - ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + ).to( + device + ) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % - (count_parameters(model))) + print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, - speaker_embeddings=speaker_embeddings) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + ) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, - mel_lengths) + stop_loss + loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1 + class TacotronGSTTrainTest(unittest.TestCase): @staticmethod def test_train_step(): # with random gst mel style input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 120, c.audio['fft_size']).to(device) - mel_lengths = torch.randint(20, 120, (8, )).long().to(device) + mel_spec = torch.rand(8, 120, c.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 120, c.audio["fft_size"]).to(device) + mel_lengths = torch.randint(20, 120, (8,)).long().to(device) mel_lengths[-1] = 120 stop_targets = torch.zeros(8, 120, 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + speaker_ids = torch.randint(0, 5, (8,)).long().to(device) for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 + stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) @@ -180,65 +173,64 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=32, num_speakers=5, gst=True, - gst_embedding_dim=c.gst['gst_embedding_dim'], - gst_num_heads=c.gst['gst_num_heads'], - gst_style_tokens=c.gst['gst_style_tokens'], - postnet_output_dim=c.audio['fft_size'], - decoder_output_dim=c.audio['num_mels'], + gst_embedding_dim=c.gst["gst_embedding_dim"], + gst_num_heads=c.gst["gst_num_heads"], + gst_style_tokens=c.gst["gst_style_tokens"], + postnet_output_dim=c.audio["fft_size"], + decoder_output_dim=c.audio["num_mels"], r=c.r, - memory_size=c.memory_size - ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + memory_size=c.memory_size, + ).to( + device + ) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % - (count_parameters(model))) + print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + ) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, - mel_lengths) + stop_loss + loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it 
works conditional - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1 # with file gst style - mel_spec = torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device) + mel_spec = ( + torch.FloatTensor(ap.melspectrogram(ap.load_wav(WAV_FILE)))[:, :120].unsqueeze(0).transpose(1, 2).to(device) + ) mel_spec = mel_spec.repeat(8, 1, 1) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - linear_spec = torch.rand(8, mel_spec.size(1), c.audio['fft_size']).to(device) - mel_lengths = torch.randint(20, mel_spec.size(1), (8, )).long().to(device) + linear_spec = torch.rand(8, mel_spec.size(1), c.audio["fft_size"]).to(device) + mel_lengths = torch.randint(20, mel_spec.size(1), (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, mel_spec.size(1), 1).float().to(device) - speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + speaker_ids = torch.randint(0, 5, (8,)).long().to(device) for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 + stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) @@ -246,113 +238,109 @@ class TacotronGSTTrainTest(unittest.TestCase): num_chars=32, num_speakers=5, gst=True, - gst_embedding_dim=c.gst['gst_embedding_dim'], - gst_num_heads=c.gst['gst_num_heads'], - gst_style_tokens=c.gst['gst_style_tokens'], - postnet_output_dim=c.audio['fft_size'], - decoder_output_dim=c.audio['num_mels'], + gst_embedding_dim=c.gst["gst_embedding_dim"], + gst_num_heads=c.gst["gst_num_heads"], + gst_style_tokens=c.gst["gst_style_tokens"], + postnet_output_dim=c.audio["fft_size"], + decoder_output_dim=c.audio["num_mels"], r=c.r, - memory_size=c.memory_size - ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + memory_size=c.memory_size, + ).to( + device + ) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() # print(model) - print(" > Num parameters for Tacotron GST model:%s" % - (count_parameters(model))) + print(" > Num parameters for Tacotron GST model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(10): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids + ) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + 
criterion(linear_out, linear_spec, - mel_lengths) + stop_loss + loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1 + class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): @staticmethod def test_train_step(): input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 129, (8, )).long().to(device) + input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths[-1] = 128 - mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) - linear_spec = torch.rand(8, 30, c.audio['fft_size']).to(device) - mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + mel_spec = torch.rand(8, 30, c.audio["num_mels"]).to(device) + linear_spec = torch.rand(8, 30, c.audio["fft_size"]).to(device) + mel_lengths = torch.randint(20, 30, (8,)).long().to(device) mel_lengths[-1] = mel_spec.size(1) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_embeddings = torch.rand(8, 55).to(device) for idx in mel_lengths: - stop_targets[:, int(idx.item()):, 0] = 1.0 + stop_targets[:, int(idx.item()) :, 0] = 1.0 - stop_targets = stop_targets.view(input_dummy.shape[0], - stop_targets.size(1) // c.r, -1) - stop_targets = (stop_targets.sum(2) > - 0.0).unsqueeze(2).float().squeeze() + stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = L1LossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) model = Tacotron( num_chars=32, num_speakers=5, - postnet_output_dim=c.audio['fft_size'], - decoder_output_dim=c.audio['num_mels'], + postnet_output_dim=c.audio["fft_size"], + decoder_output_dim=c.audio["num_mels"], gst=True, - gst_embedding_dim=c.gst['gst_embedding_dim'], - gst_num_heads=c.gst['gst_num_heads'], - gst_style_tokens=c.gst['gst_style_tokens'], - gst_use_speaker_embedding=c.gst['gst_use_speaker_embedding'], + gst_embedding_dim=c.gst["gst_embedding_dim"], + gst_num_heads=c.gst["gst_num_heads"], + gst_style_tokens=c.gst["gst_style_tokens"], + gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"], r=c.r, memory_size=c.memory_size, speaker_embedding_dim=55, - ).to(device) #FIXME: missing num_speakers parameter to Tacotron ctor + ).to( + device + ) # FIXME: missing num_speakers parameter to Tacotron ctor model.train() - print(" > Num parameters for Tacotron model:%s" % - (count_parameters(model))) + print(" > Num parameters for Tacotron model:%s" % (count_parameters(model))) model_ref = copy.deepcopy(model) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): mel_out, linear_out, align, stop_tokens = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, - 
speaker_embeddings=speaker_embeddings) + input_dummy, input_lengths, mel_spec, mel_lengths, speaker_embeddings=speaker_embeddings + ) optimizer.zero_grad() loss = criterion(mel_out, mel_spec, mel_lengths) stop_loss = criterion_st(stop_tokens, stop_targets) - loss = loss + criterion(linear_out, linear_spec, - mel_lengths) + stop_loss + loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss loss.backward() optimizer.step() # check parameter changes count = 0 - for name_param, param_ref in zip(model.named_parameters(), - model_ref.parameters()): + for name_param, param_ref in zip(model.named_parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: name, param = name_param - if name == 'gst_layer.encoder.recurrence.weight_hh_l0': + if name == "gst_layer.encoder.recurrence.weight_hh_l0": continue - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1 diff --git a/tests/test_text_cleaners.py b/tests/test_text_cleaners.py index b301fb5a..fcfa71e7 100644 --- a/tests/test_text_cleaners.py +++ b/tests/test_text_cleaners.py @@ -17,5 +17,5 @@ def test_currency() -> None: def test_expand_numbers() -> None: - assert phoneme_cleaners("-1") == 'minus one' - assert phoneme_cleaners("1") == 'one' + assert phoneme_cleaners("-1") == "minus one" + assert phoneme_cleaners("1") == "one" diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index b8b74e28..f70056b1 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -7,7 +7,8 @@ from tests import get_tests_input_path, get_tests_path from TTS.tts.utils.text import * from TTS.utils.io import load_config -conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +conf = load_config(os.path.join(get_tests_input_path(), "test_config.json")) + def test_phoneme_to_sequence(): @@ -18,7 +19,7 @@ def test_phoneme_to_sequence(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) - gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!' + gt = "ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt # multiple punctuations @@ -87,6 +88,7 @@ def test_phoneme_to_sequence(): print(len(sequence)) assert text_hat == text_hat_with_params == gt + def test_phoneme_to_sequence_with_blank_token(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" @@ -105,7 +107,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?' + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ?" 
print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -116,7 +118,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ' + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -127,7 +129,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = 'biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!' + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -138,7 +140,7 @@ def test_phoneme_to_sequence_with_blank_token(): text_hat = sequence_to_phoneme(sequence) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) - gt = 'biː ɐ vɔɪs, nɑːt æn! ɛkoʊ.' + gt = "biː ɐ vɔɪs, nɑːt æn! ɛkoʊ." print(text_hat) print(len(sequence)) assert text_hat == text_hat_with_params == gt @@ -165,9 +167,10 @@ def test_phoneme_to_sequence_with_blank_token(): print(len(sequence)) assert text_hat == text_hat_with_params == gt + def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" - gt = 'ɹ|iː|s|ə|n|t| |ɹ|ᵻ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|ŋ|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!' + gt = "ɹ|iː|s|ə|n|t| |ɹ|ᵻ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|ŋ|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" 
lang = "en-us" ph = text2phone(text, lang) assert gt == ph diff --git a/tests/test_vocoder_gan_datasets.py b/tests/test_vocoder_gan_datasets.py index 13b1b9d2..84ddcd93 100644 --- a/tests/test_vocoder_gan_datasets.py +++ b/tests/test_vocoder_gan_datasets.py @@ -13,31 +13,32 @@ file_path = os.path.dirname(os.path.realpath(__file__)) OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +C = load_config(os.path.join(get_tests_input_path(), "test_config.json")) test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") ok_ljspeech = os.path.exists(test_data_path) -def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers): - ''' run dataloader with given parameters and check conditions ''' +def gan_dataset_case( + batch_size, seq_len, hop_len, conv_pad, return_segments, use_noise_augment, use_cache, num_workers +): + """ run dataloader with given parameters and check conditions """ ap = AudioProcessor(**C.audio) _, train_items = load_wav_data(test_data_path, 10) - dataset = GANDataset(ap, - train_items, - seq_len=seq_len, - hop_len=hop_len, - pad_short=2000, - conv_pad=conv_pad, - return_segments=return_segments, - use_noise_augment=use_noise_augment, - use_cache=use_cache) - loader = DataLoader(dataset=dataset, - batch_size=batch_size, - shuffle=True, - num_workers=num_workers, - pin_memory=True, - drop_last=True) + dataset = GANDataset( + ap, + train_items, + seq_len=seq_len, + hop_len=hop_len, + pad_short=2000, + conv_pad=conv_pad, + return_segments=return_segments, + use_noise_augment=use_noise_augment, + use_cache=use_cache, + ) + loader = DataLoader( + dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, drop_last=True + ) max_iter = 10 count_iter = 0 @@ -61,8 +62,8 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us mel = ap.melspectrogram(audio) # the first 2 and the last 2 frames are skipped due to the padding # differences in stft - max_diff = abs((feat - mel[:, :feat1.shape[-1]])[:, 2:-2]).max() - assert max_diff <= 0, f' [!] {max_diff}' + max_diff = abs((feat - mel[:, : feat1.shape[-1]])[:, 2:-2]).max() + assert max_diff <= 0, f" [!] 
{max_diff}" count_iter += 1 # if count_iter == max_iter: @@ -79,17 +80,17 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us def test_parametrized_gan_dataset(): - ''' test dataloader with different parameters ''' + """ test dataloader with different parameters """ params = [ - [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0], - [32, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 4], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, True, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 0, True, True, True, 0], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, True, True, True, 0], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, True, True, 0], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, False, True, 0], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, True, True, False, 0], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 0, False, False, False, 0], + [32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 0], + [32, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 0, True, True, True, 0], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, True, True, True, 0], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, False, True, True, 0], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, False, True, 0], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, True, True, False, 0], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 0, False, False, False, 0], ] for param in params: print(param) diff --git a/tests/test_vocoder_losses.py b/tests/test_vocoder_losses.py index 7b3c7017..87151a05 100644 --- a/tests/test_vocoder_losses.py +++ b/tests/test_vocoder_losses.py @@ -14,7 +14,7 @@ os.makedirs(OUT_PATH, exist_ok=True) WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") -C = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) +C = load_config(os.path.join(get_tests_input_path(), "test_config.json")) ap = AudioProcessor(**C.audio) @@ -45,7 +45,8 @@ def test_multiscale_stft_loss(): stft_loss = MultiScaleSTFTLoss( [ap.fft_size // 2, ap.fft_size, ap.fft_size * 2], [ap.hop_length // 2, ap.hop_length, ap.hop_length * 2], - [ap.win_length // 2, ap.win_length, ap.win_length * 2]) + [ap.win_length // 2, ap.win_length, ap.win_length * 2], + ) wav = ap.load_wav(WAV_FILE) wav = torch.from_numpy(wav[None, :]).float() loss_m, loss_sc = stft_loss(wav, wav) diff --git a/tests/test_vocoder_parallel_wavegan_discriminator.py b/tests/test_vocoder_parallel_wavegan_discriminator.py index 6496d562..d4eca0d1 100644 --- a/tests/test_vocoder_parallel_wavegan_discriminator.py +++ b/tests/test_vocoder_parallel_wavegan_discriminator.py @@ -1,8 +1,10 @@ import numpy as np import torch -from TTS.vocoder.models.parallel_wavegan_discriminator import (ParallelWaveganDiscriminator, - ResidualParallelWaveganDiscriminator) +from TTS.vocoder.models.parallel_wavegan_discriminator import ( + ParallelWaveganDiscriminator, + ResidualParallelWaveganDiscriminator, +) def test_pwgan_disciminator(): @@ -15,7 +17,8 @@ def test_pwgan_disciminator(): dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True) + bias=True, + ) dummy_x = 
torch.rand((4, 1, 64 * 256)) output = model(dummy_x) assert np.all(output.shape == (4, 1, 64 * 256)) @@ -35,7 +38,8 @@ def test_redisual_pwgan_disciminator(): dropout=0.0, bias=True, nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}) + nonlinear_activation_params={"negative_slope": 0.2}, + ) dummy_x = torch.rand((4, 1, 64 * 256)) output = model(dummy_x) assert np.all(output.shape == (4, 1, 64 * 256)) diff --git a/tests/test_vocoder_parallel_wavegan_generator.py b/tests/test_vocoder_parallel_wavegan_generator.py index 9eed0eee..21f6f08f 100644 --- a/tests/test_vocoder_parallel_wavegan_generator.py +++ b/tests/test_vocoder_parallel_wavegan_generator.py @@ -18,7 +18,8 @@ def test_pwgan_generator(): dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=[4, 4, 4, 4]) + upsample_factors=[4, 4, 4, 4], + ) dummy_c = torch.rand((2, 80, 5)) output = model(dummy_c) assert np.all(output.shape == (2, 1, 5 * 256)), output.shape diff --git a/tests/test_vocoder_pqmf.py b/tests/test_vocoder_pqmf.py index 3112df5a..afe8d1dc 100644 --- a/tests/test_vocoder_pqmf.py +++ b/tests/test_vocoder_pqmf.py @@ -23,5 +23,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), 'pqmf_output.wav'), - w2_.flatten().detach(), sr) + sf.write(os.path.join(get_tests_output_path(), "pqmf_output.wav"), w2_.flatten().detach(), sr) diff --git a/tests/test_vocoder_rwd.py b/tests/test_vocoder_rwd.py index 82525e1b..371ad9e4 100644 --- a/tests/test_vocoder_rwd.py +++ b/tests/test_vocoder_rwd.py @@ -5,14 +5,12 @@ from TTS.vocoder.models.random_window_discriminator import RandomWindowDiscrimin def test_rwd(): - layer = RandomWindowDiscriminator(cond_channels=80, - window_sizes=(512, 1024, 2048, 4096, - 8192), - cond_disc_downsample_factors=[ - (8, 4, 2, 2, 2), (8, 4, 2, 2), - (8, 4, 2), (8, 4), (4, 2, 2) - ], - hop_length=256) + layer = RandomWindowDiscriminator( + cond_channels=80, + window_sizes=(512, 1024, 2048, 4096, 8192), + cond_disc_downsample_factors=[(8, 4, 2, 2, 2), (8, 4, 2, 2), (8, 4, 2), (8, 4), (4, 2, 2)], + hop_length=256, + ) x = torch.rand([4, 1, 22050]) c = torch.rand([4, 80, 22050 // 256]) diff --git a/tests/test_vocoder_tf_pqmf.py b/tests/test_vocoder_tf_pqmf.py index 28aebe5b..f1c3666b 100644 --- a/tests/test_vocoder_tf_pqmf.py +++ b/tests/test_vocoder_tf_pqmf.py @@ -24,5 +24,4 @@ def test_pqmf(): print(w2_.max()) print(w2_.min()) print(w2_.mean()) - sf.write(os.path.join(get_tests_output_path(), 'tf_pqmf_output.wav'), - w2_.flatten(), sr) + sf.write(os.path.join(get_tests_output_path(), "tf_pqmf_output.wav"), w2_.flatten(), sr) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py index 6a2a3339..7bd4380b 100644 --- a/tests/test_vocoder_wavernn_datasets.py +++ b/tests/test_vocoder_wavernn_datasets.py @@ -14,8 +14,7 @@ file_path = os.path.dirname(os.path.realpath(__file__)) OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") os.makedirs(OUTPATH, exist_ok=True) -C = load_config(os.path.join(get_tests_input_path(), - "test_vocoder_wavernn_config.json")) +C = load_config(os.path.join(get_tests_input_path(), "test_vocoder_wavernn_config.json")) test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") test_mel_feat_path = os.path.join(test_data_path, "mel") @@ -33,25 +32,20 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor C.data_path = test_data_path preprocess_wav_files(test_data_path, C, ap) - _, train_items = 
load_wav_feat_data( - test_data_path, test_mel_feat_path, 5) + _, train_items = load_wav_feat_data(test_data_path, test_mel_feat_path, 5) - dataset = WaveRNNDataset(ap=ap, - items=train_items, - seq_len=seq_len, - hop_len=hop_len, - pad=pad, - mode=mode, - mulaw=mulaw - ) + dataset = WaveRNNDataset( + ap=ap, items=train_items, seq_len=seq_len, hop_len=hop_len, pad=pad, mode=mode, mulaw=mulaw + ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=batch_size, - num_workers=num_workers, - pin_memory=True, - ) + loader = DataLoader( + dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + ) max_iter = 10 count_iter = 0 @@ -59,10 +53,8 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor try: for data in loader: x_input, mels, _ = data - expected_feat_shape = (ap.num_mels, - (x_input.shape[-1] // hop_len) + (pad * 2)) - assert np.all( - mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" + expected_feat_shape = (ap.num_mels, (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all(mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] count_iter += 1 @@ -77,15 +69,15 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_wor def test_parametrized_wavernn_dataset(): - ''' test dataloader with different parameters ''' + """ test dataloader with different parameters """ params = [ - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, True, 0], - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", False, 4], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, False, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, True, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", False, 0], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, False, 2], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", False, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 10, True, 0], + [16, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, "mold", False, 4], + [1, C.audio["hop_length"] * 10, C.audio["hop_length"], 2, 9, False, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, 10, True, 0], + [1, C.audio["hop_length"], C.audio["hop_length"], 2, "mold", False, 0], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 4, 10, False, 2], + [1, C.audio["hop_length"] * 5, C.audio["hop_length"], 2, "mold", False, 0], ] for param in params: print(param) diff --git a/tests/test_wavegrad_layers.py b/tests/test_wavegrad_layers.py index 6052e922..0180eb0a 100644 --- a/tests/test_wavegrad_layers.py +++ b/tests/test_wavegrad_layers.py @@ -75,12 +75,12 @@ def test_wavegrad_forward(): c = torch.rand(32, 80, 20) noise_scale = torch.rand(32) - model = Wavegrad(in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], - [1, 2, 4, 8], [1, 2, 4, 8], - [1, 2, 4, 8]]) + model = Wavegrad( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) o = model.forward(x, c, noise_scale) assert o.shape[0] == 32 diff --git a/tests/test_wavegrad_train.py b/tests/test_wavegrad_train.py index 
6c950c5a..a28409e5 100644 --- a/tests/test_wavegrad_train.py +++ b/tests/test_wavegrad_train.py @@ -6,7 +6,7 @@ from torch import optim from TTS.vocoder.models.wavegrad import Wavegrad -#pylint: disable=unused-variable +# pylint: disable=unused-variable torch.manual_seed(1) use_cuda = torch.cuda.is_available() @@ -20,19 +20,19 @@ class WavegradTrainTest(unittest.TestCase): mel_spec = torch.rand(8, 80, 20).to(device) criterion = torch.nn.L1Loss().to(device) - model = Wavegrad(in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], - [1, 2, 4, 8], [1, 2, 4, 8], - [1, 2, 4, 8]]) + model = Wavegrad( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) - model_ref = Wavegrad(in_channels=80, - out_channels=1, - upsample_factors=[5, 5, 3, 2, 2], - upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], - [1, 2, 4, 8], [1, 2, 4, 8], - [1, 2, 4, 8]]) + model_ref = Wavegrad( + in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + ) model.train() model.to(device) betas = np.linspace(1e-6, 1e-2, 1000) @@ -40,8 +40,7 @@ class WavegradTrainTest(unittest.TestCase): model_ref.load_state_dict(model.state_dict()) model_ref.to(device) count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): assert (param - param_ref).sum() == 0, param count += 1 optimizer = optim.Adam(model.parameters(), lr=0.001) @@ -53,11 +52,10 @@ class WavegradTrainTest(unittest.TestCase): optimizer.step() # check parameter changes count = 0 - for param, param_ref in zip(model.parameters(), - model_ref.parameters()): + for param, param_ref in zip(model.parameters(), model_ref.parameters()): # ignore pre-higway layer since it works conditional # if count not in [145, 59]: - assert (param != param_ref).any( - ), "param {} with shape {} not updated!! \n{}\n{}".format( - count, param.shape, param, param_ref) + assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format( + count, param.shape, param, param_ref + ) count += 1
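
Editor's note: every test file touched above repeats the same training smoke-test pattern that this patch merely reformats — snapshot a freshly built model with copy.deepcopy, run a few optimizer steps on dummy data, then assert that every parameter has moved away from the snapshot. The sketch below distills that pattern in isolation so the reformatted hunks are easier to follow; the toy linear model, tensor shapes, and learning rate are illustrative stand-ins, not code from this patch.

import copy

import torch
from torch import nn, optim

torch.manual_seed(1)

# Toy stand-in for the Tacotron/Wavegrad models; any trainable module exercises the pattern.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
model.train()
model_ref = copy.deepcopy(model)  # frozen snapshot of the initial weights

# Sanity check: the snapshot starts out identical to the live model.
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
    assert (param - param_ref).sum() == 0, param

criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
dummy_input = torch.rand(8, 16)
dummy_target = torch.rand(8, 1)

for _ in range(5):
    optimizer.zero_grad()
    loss = criterion(model(dummy_input), dummy_target)
    loss.backward()
    optimizer.step()

# After a few steps every parameter should differ from the snapshot; a parameter
# that never changes usually means it is disconnected from the loss graph, which
# is exactly the regression these tests guard against.
for count, (param, param_ref) in enumerate(zip(model.parameters(), model_ref.parameters())):
    assert (param != param_ref).any(), "param {} with shape {} not updated!!".format(count, param.shape)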