Files
TTS/tests/tts_tests2/test_feed_forward_layers.py
logan hart 6fdb88f8e2 Add Delightful-TTS implementation (#2095)
* add configs

* Update config file

* Add model configs

* Add model layers

* Add layer files

* Add layer modules

* change config names

* Add emotion manager

* Fix missing ap bug

* Fix missing ap bug

* Add base TTS e2e class

* Fix wrong variable name in load_tts_samples

* Add training script

* Remove range predictor and gaussian upsampling

* Add helper function

* Add vctk recipe

* Add conformer docs

* Fix linting in conformer.py

* Add Docs

* remove duplicate import

* refactor args

* Fix bugs

* Remove emotion embedding

* remove unused arg

* Remove emotion embedding arg

* Remove emotion embedding arg

* fix style issues

* Fix bugs

* Fix bugs

* Add unittests

* make style

* fix formatter bug

* fix test

* Add pyworld compute pitch func

* Update requirements.txt

* Fix dataset Bug

* Change layer norm to instance norm

* Add missing import

* Remove emotions.py

* remove ssim loss

* Add init layers func to aligner

* refactor model layers

* remove audio_config arg

* Rename loss func

* Rename to delightful-tts

* Rename loss func

* Remove unused modules

* refactor imports

* replace audio config with audio processor

* Add change sample rate option

* remove broken resample func

* update recipe

* fix style, add config docs

* fix tests and multispeaker embd dim

* remove pyworld

* Make style and fix inference

* Split tts tests

* Fixup

* Fixup

* Fixup

* Add argument names

* Set "random" speaker in the model Tortoise/Bark

* Use a different f0_cache path for Delightful-TTS

* Fix delightful speaker handling

* Fix lint

* Make style

---------

Co-authored-by: loganhart420 <loganartpersonal@gmail.com>
Co-authored-by: Eren Gölge <erogol@hotmail.com>
2023-07-24 13:41:26 +02:00

108 lines
3.5 KiB
Python

import torch
from TTS.tts.layers.feed_forward.decoder import Decoder
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.utils.helpers import sequence_mask
# Shared by every test in this module: the first CUDA device when one is
# available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
def test_encoder():
    """Check the output shape produced by each feed-forward ``Encoder`` type.

    A random ``(8, 14, 37)`` batch and a mask built by ``sequence_mask`` from
    random lengths are passed through every encoder variant in turn. Each
    output must have shape ``[8, out_channels, 37]``.
    """
    batch_size, in_channels, max_len = 8, 14, 37

    features = torch.rand(batch_size, in_channels, max_len).to(device)
    lengths = torch.randint(31, 37, (batch_size,)).long().to(device)
    # Pin the last entry to the full length of the dummy input.
    lengths[-1] = max_len
    mask = sequence_mask(lengths, features.size(2)).unsqueeze(1).to(device)

    # (encoder_type, encoder_params, out_channels), exercised in this order.
    cases = (
        # relative positional transformer encoder
        (
            "relative_position_transformer",
            {
                "hidden_channels_ffn": 768,
                "num_heads": 2,
                "kernel_size": 3,
                "dropout_p": 0.1,
                "num_layers": 6,
                "rel_attn_window_size": 4,
                "input_length": None,
            },
            11,
        ),
        # residual conv bn encoder
        (
            "residual_conv_bn",
            {
                "kernel_size": 4,
                "dilations": 4 * [1, 2, 4] + [1],
                "num_conv_blocks": 2,
                "num_res_blocks": 13,
            },
            11,
        ),
        # FFTransformer encoder
        (
            "fftransformer",
            {
                "hidden_channels_ffn": 31,
                "num_heads": 2,
                "num_layers": 2,
                "dropout_p": 0.1,
            },
            14,
        ),
    )

    for encoder_type, encoder_params, out_channels in cases:
        encoder = Encoder(
            out_channels=out_channels,
            in_hidden_channels=in_channels,
            encoder_type=encoder_type,
            encoder_params=encoder_params,
        ).to(device)
        result = encoder(features, mask)
        assert list(result.shape) == [batch_size, out_channels, max_len]
def test_decoder():
    """Check the output shape produced by each feed-forward ``Decoder`` type.

    A random ``(8, 128, 37)`` batch and a mask built by ``sequence_mask`` from
    random lengths are passed through the default decoder and through the
    relative-position transformer, wavenet and FFTransformer variants. Every
    variant is built with ``out_channels=11`` and its output must have shape
    ``[8, 11, 37]``.
    """
    input_dummy = torch.rand(8, 128, 37).to(device)
    input_lengths = torch.randint(31, 37, (8,)).long().to(device)
    # Pin the last entry to the full length of the dummy input.
    input_lengths[-1] = 37
    input_mask = torch.unsqueeze(sequence_mask(input_lengths, input_dummy.size(2)), 1).to(device)

    # residual bn conv decoder (the Decoder defaults)
    layer = Decoder(out_channels=11, in_hidden_channels=128).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]

    # transformer decoder
    layer = Decoder(
        out_channels=11,
        in_hidden_channels=128,
        decoder_type="relative_position_transformer",
        decoder_params={
            "hidden_channels_ffn": 128,
            "num_heads": 2,
            "kernel_size": 3,
            "dropout_p": 0.1,
            "num_layers": 8,
            "rel_attn_window_size": 4,
            "input_length": None,
        },
    ).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]

    # wavenet decoder
    layer = Decoder(
        out_channels=11,
        in_hidden_channels=128,
        decoder_type="wavenet",
        decoder_params={
            "num_blocks": 12,
            "hidden_channels": 192,
            "kernel_size": 5,
            "dilation_rate": 1,
            "num_layers": 4,
            "dropout_p": 0.05,
        },
    ).to(device)
    output = layer(input_dummy, input_mask)
    # This check was previously missing, so the wavenet output was computed
    # but never verified; it is held to the same shape as the other variants.
    assert list(output.shape) == [8, 11, 37]

    # FFTransformer decoder
    layer = Decoder(
        out_channels=11,
        in_hidden_channels=128,
        decoder_type="fftransformer",
        decoder_params={
            "hidden_channels_ffn": 31,
            "num_heads": 2,
            "dropout_p": 0.1,
            "num_layers": 2,
        },
    ).to(device)
    output = layer(input_dummy, input_mask)
    assert list(output.shape) == [8, 11, 37]