diff --git a/TTS/__init__.py b/TTS/__init__.py
index da35faf8..5162d4ec 100644
--- a/TTS/__init__.py
+++ b/TTS/__init__.py
@@ -1,6 +1,5 @@
 import os
-
 with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
     version = f.read().strip()
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 49e7a08a..14319c44 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
+import unicodedata
 
 import gruut
 from packaging import version
 
@@ -26,32 +27,34 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
 # Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 
-# language -> source phoneme -> dest phoneme
-# Used to make gruut's phonemes fit better with eSpeak's.
-GRUUT_PHONEME_MAP = {
-    "en-us": {
-        "i": "iː",
-        "ɑ": "ɑː",
-        "ɚ": "ɜːɹ",
-    },
-    "de": {
-        "ʁ": "ɾ",
-        "g": "ɡ",
-        "ʔ": "",
-    },
-    "nl": {
-        "a": "aː",
-        "e": "eː",
-        "ʏ": "ɵ",
-        "ʋ": "w",
-        "ɹ": "r",
-        "ɔː": "oː",
-    },
-    "es": {
-        "ɾ": "r",
-        "g": "ɣ",
-    },
-}
+# Table for str.translate to fix gruut/TTS phoneme mismatch
+GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
+
+
+def clean_gruut_phonemes(ph_list):
+    """Decompose, substitute, and clean gruut phonemes for TTS.
+
+    Parameters:
+        ph_list (list[str]): list of phonemes from gruut
+
+    Returns:
+        clean_list (list[str]): decomposed/cleaned list of phonemes for TTS.
+            Diphthongs, etc. are decomposed into single characters.
+            Unicode combining characters are removed (e.g., ties).
+    """
+    cleaned_phonemes = []
+
+    for phoneme_text in ph_list:
+        # Decompose into codepoints (ã -> ["a", "\u0303"])
+        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
+        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
+            if unicodedata.combining(codepoint) > 0:
+                # Skip combining characters like ties
+                continue
+
+            cleaned_phonemes.append(codepoint)
+
+    return cleaned_phonemes
 
 
 def text2phone(text, language):
@@ -82,21 +85,14 @@ def text2phone(text, language):
             lang=language,
             return_format="word_phonemes",
             phonemizer_args={
-                "remove_stress": True,  # remove primary/secondary stress
+                "remove_accents": True,  # remove acute/grave accents (Swedish)
                 "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
                 "ipa_major_breaks": False,  # don't replace periods with IPA ‖
             },
         )
 
-        ph_map = GRUUT_PHONEME_MAP.get(language)
-        if ph_map:
-            # Re-map phonemes to fit with eSpeak conventions
-            for word in ph_list:
-                for p_idx, p in enumerate(word):
-                    word[p_idx] = ph_map.get(p, p)
-
         # Join and re-split to break apart dipthongs, suprasegmentals, etc.
-        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
+        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
 
         print(" > Phonemes: {}".format(ph))
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
new file mode 100644
index 00000000..f4938ca0
--- /dev/null
+++ b/tests/test_text_processing.py
@@ -0,0 +1,137 @@
+"""Tests for text to phoneme conversion"""
+import unittest
+
+import gruut
+from gruut_ipa import IPA, Phonemes
+
+from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
+from TTS.tts.utils.text import phonemes as all_phonemes
+from TTS.tts.utils.text import sequence_to_phoneme
+
+# -----------------------------------------------------------------------------
+
+EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+
+# Raw phonemes from a run of gruut with the example text (en-us).
+# This includes IPA ties, etc.
+EXAMPLE_PHONEMES = [
+    ["ɹ", "ˈi", "s", "ə", "n", "t"],
+    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
+    ["ˈæ", "t"],
+    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
+    ["h", "ˈæ", "z"],
+    ["ʃ", "ˈoʊ", "n"],
+    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
+    ["f", "ɚ"],
+    ["ˈæ", "z"],
+    ["l", "ˈɪ", "t", "ə", "l"],
+    ["ˈæ", "z"],
+    ["ˈeɪ", "t"],
+    ["w", "ˈi", "k", "s"],
+    ["k", "ə", "n"],
+    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
+    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
+    [","],
+    ["ð", "ə"],
+    ["ɡ", "ɹ", "ˈeɪ"],
+    ["m", "ˈæ", "t", "ɚ"],
+    ["ˈɪ", "n"],
+    ["ð", "ə"],
+    ["p", "ˈɑ", "ɹ", "t", "s"],
+    ["ə", "v"],
+    ["ð", "ə"],
+    ["b", "ɹ", "ˈeɪ", "n"],
+    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
+    ["f", "ɚ"],
+    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
+    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
+    ["ˈæ", "n", "d"],
+    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
+    ["!"],
+]
+
+# -----------------------------------------------------------------------------
+
+
+class TextProcessingTextCase(unittest.TestCase):
+    """Tests for text to phoneme conversion"""
+
+    def test_all_phonemes_in_tts(self):
+        """Ensure that all phonemes from gruut are present in TTS phonemes"""
+        tts_phonemes = set(all_phonemes)
+
+        # Check stress characters
+        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
+            self.assertIn(suprasegmental, tts_phonemes)
+
+        # Check that gruut's phonemes are a subset of TTS phonemes
+        for lang in gruut.get_supported_languages():
+            for phoneme in Phonemes.from_language(lang):
+                for codepoint in clean_gruut_phonemes(phoneme.text):
+
+                    self.assertIn(codepoint, tts_phonemes)
+
+    def test_phoneme_to_sequence(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline"""
+        lang = "en-us"
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
+        )
+
+        # Ensure that TTS produces the same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+    def test_phoneme_to_sequence_with_blank_token(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
+        lang = "en-us"
+        text_cleaner = ["phoneme_cleaners"]
+
+        # Create sequences with/without the blank token
+        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
+        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
+
+        # The sequence with blanks should be bigger
+        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
+
+        # But the phoneme strings should still be identical
+        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
+        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
+
+        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
+
+    def test_messy_text(self):
+        """Verify that text with extra punctuation/whitespace/etc. makes it through the pipeline"""
+        text = '"Be" a! voice, [NOT]? (an eCHo. '
+        lang = "en-us"
+        expected_phonemes = [
+            ["b", "ˈi"],
+            ["ə"],
+            ["!"],
+            ["v", "ˈɔɪ", "s"],
+            [","],
+            ["n", "ˈɑ", "t"],
+            ["?"],
+            ["ə", "n"],
+            ["ˈɛ", "k", "oʊ"],
+            ["."],
+        ]
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
+        )
+
+        # Ensure that TTS produces the same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    unittest.main()
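For reviewers, a quick standalone illustration of what the new cleaning step does (this snippet is not part of the diff; the sample word is taken from `EXAMPLE_PHONEMES` above). NFD normalization splits each phoneme into base codepoints plus combining marks, `GRUUT_TRANS_TABLE` maps ASCII "g" to IPA "ɡ", and combining marks such as the tie in "t͡ʃ" are dropped, so affricates, diphthongs, and stress marks come out as single characters that can be matched against the TTS phoneme set.

```python
import unicodedata

# Same mapping as in the diff: ASCII "g" -> IPA "ɡ" (U+0261)
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")


def clean_gruut_phonemes(ph_list):
    """Decompose gruut phonemes into single codepoints, dropping combining marks."""
    cleaned = []
    for phoneme_text in ph_list:
        # "t͡ʃ" is already "t" + U+0361 + "ʃ"; a precomposed "ã" becomes "a" + U+0303
        for codepoint in unicodedata.normalize("NFD", phoneme_text).translate(GRUUT_TRANS_TABLE):
            if unicodedata.combining(codepoint) > 0:
                continue  # skip ties and other combining characters
            cleaned.append(codepoint)
    return cleaned


# Second word of EXAMPLE_PHONEMES ("research"): the tie in "t͡ʃ" is removed
# and the stress mark is split off as its own symbol.
print(clean_gruut_phonemes(["ɹ", "i", "s", "ˈɚ", "t͡ʃ"]))
# ['ɹ', 'i', 's', 'ˈ', 'ɚ', 't', 'ʃ']
```

Assuming the project's usual pytest setup, the new tests can be run in isolation with `python -m pytest tests/test_text_processing.py`, or directly with `python tests/test_text_processing.py` via the `unittest.main()` entry point.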