diff --git a/TTS/VERSION b/TTS/VERSION index ceddfb28..e3b86dd9 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.15 +0.0.16 diff --git a/TTS/__init__.py b/TTS/__init__.py index da35faf8..5162d4ec 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1,6 +1,5 @@ import os - with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: version = f.read().strip() diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 4eb79d76..a0551484 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -299,5 +299,6 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) - c.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel + c.audio["do_trim_silence"] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel + main(args) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4690e76f..a501a880 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig): Audio processor config object instance. use_phonemes (bool): enable / disable phoneme use. + use_espeak_phonemes (bool): + enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). compute_input_seq_cache (bool): enable / disable precomputation of the phoneme sequences. 
At the expense of some delay at the beginning of the training, It allows faster data loader time and precise limitation with `max_seq_len` and @@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig): audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # phoneme settings use_phonemes: bool = False + use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False text_cleaner: str = MISSING diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9f417a1d..0ddf7ebe 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG): CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, + use_espeak_phonemes=CONFIG.use_espeak_phonemes, ), dtype=np.int32, ) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index f9f44167..787394b5 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- import re +import unicodedata +import gruut from packaging import version from TTS.tts.utils.text import cleaners @@ -25,8 +27,11 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)") # Regular expression matching punctuations, ignoring empty space PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" +# Table for str.translate to fix gruut/TTS phoneme mismatch +GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def text2phone(text, language): + +def text2phone(text, language, use_espeak_phonemes=False): """Convert graphemes to phonemes. Parameters: text (str): text to phonemize @@ -39,10 +44,43 @@ def text2phone(text, language): # TO REVIEW : How to have a good implementation for this? 
if language == "zh-CN": ph = chinese_text_to_phonemes(text) + print(" > Phonemes: {}".format(ph)) return ph if language == "ja-jp": ph = japanese_text_to_phonemes(text) + print(" > Phonemes: {}".format(ph)) + return ph + + if gruut.is_language_supported(language): + # Use gruut for phonemization + phonemizer_args = { + "remove_stress": True, + "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | + "ipa_major_breaks": False, # don't replace periods with IPA ‖ + } + + if use_espeak_phonemes: + # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA. + # This is intended for backwards compatibility with TTS<=v0.0.13 + # pre-trained models. + phonemizer_args["model_prefix"] = "espeak" + + ph_list = gruut.text_to_phonemes( + text, + lang=language, + return_format="word_phonemes", + phonemizer_args=phonemizer_args, + ) + + # Join and re-split to break apart diphthongs, suprasegmentals, etc. + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph = "| ".join(ph_words) + + # Fix a few phonemes + ph = ph.translate(GRUUT_TRANS_TABLE) + + print(" > Phonemes: {}".format(ph)) return ph raise ValueError(f" [!] 
Language {language} is not supported for phonemization.") @@ -66,7 +104,9 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): +def phoneme_to_sequence( + text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False +): # pylint: disable=global-statement global _phonemes_to_id, _phonemes if tp: @@ -75,7 +115,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = [] clean_text = _clean_text(text, cleaner_names) - to_phonemes = text2phone(clean_text, language) + to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes) if to_phonemes is None: print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. @@ -86,6 +126,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = pad_with_eos_bos(sequence, tp=tp) if add_blank: sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) + return sequence diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index cf7df7de..f5165079 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -102,10 +102,10 @@ class ModelManager(object): output_model_path = os.path.join(output_path, "model_file.pth.tar") output_config_path = os.path.join(output_path, "config.json") # NOTE : band-aid for removing phoneme support - if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: - raise RuntimeError( - " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." - ) + # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: + # raise RuntimeError( + # " [!] 
Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." + # ) if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: diff --git a/requirements.txt b/requirements.txt index fde48978..046139d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,5 @@ coqpit # japanese g2p deps mecab-python3==1.0.3 unidic-lite==1.0.8 +# gruut+supported langs +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py new file mode 100644 index 00000000..3c424a15 --- /dev/null +++ b/tests/test_text_processing.py @@ -0,0 +1,104 @@ +"""Tests for text to phoneme conversion""" +import unittest + +from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone + +# ----------------------------------------------------------------------------- + +LANG = "en-us" + +EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + +EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" 
+ +# ----------------------------------------------------------------------------- + + +class TextProcessingTextCase(unittest.TestCase): + """Tests for text to phoneme conversion""" + + def test_phoneme_to_sequence(self): + """Verify en-us sentence phonemes without blank token""" + self._test_phoneme_to_sequence(add_blank=False) + + def test_phoneme_to_sequence_with_blank_token(self): + """Verify en-us sentence phonemes with blank token""" + self._test_phoneme_to_sequence(add_blank=True) + + def _test_phoneme_to_sequence(self, add_blank): + """Verify en-us sentence phonemes""" + text_cleaner = ["phoneme_cleaners"] + sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = EXPECTED_PHONEMES.replace("|", "") + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + # original + text = "Be a voice, not an echo!" 
+ sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence( + text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True + ) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "^biː ɐ vɔɪs , nɑːt ɐn ! 
ɛkoʊ .~" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) + + def test_text2phone(self): + """Verify phones directly (with |)""" + ph = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True) + self.assertEqual(ph, EXPECTED_PHONEMES) + + +# ----------------------------------------------------------------------------- + +if __name__ == "__main__": + unittest.main() diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 2e675d13..e44f6365 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -16,7 +16,8 @@ config = GlowTTSConfig( num_val_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - phoneme_language="zh-CN", + use_espeak_phonemes=True, + phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 3f508117..9dcf0ad8 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -16,7 +16,7 @@ config = SpeedySpeechConfig( num_val_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - phoneme_language="zh-CN", + phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 081fb40e..ef362414 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -20,6 +20,7 @@ config = MultibandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, )