mirror of
https://github.com/voice-cloning-app/Voice-Cloning-App.git
synced 2026-02-24 20:20:22 +01:00
110 lines
3.4 KiB
Python
110 lines
3.4 KiB
Python
import argparse
|
|
import re
|
|
|
|
import inflect
|
|
from unidecode import unidecode
|
|
|
|
# Shared inflect engine, created once at import time; clean_text calls its
# number_to_words() to spell out ordinals and numbers.
INFLECT_ENGINE = inflect.engine()
|
|
# Digit runs that may contain thousands separators, e.g. "1,000". Any plain
# run of three or more digits matches as well.
COMMA_NUMBER_RE = re.compile(r"([0-9][0-9\,]+[0-9])")
# Decimal numbers such as "3.14".
DECIMAL_NUMBER_RE = re.compile(r"([0-9]+\.[0-9]+)")
# Any run of digits.
NUMBER_RE = re.compile(r"[0-9]+")
# Ordinals such as "1st", "22nd", "3rd", "4th". The suffix is an alternation
# of the four real suffixes (a character class would accept any mix of the
# letters s/t/n/d/r/h and "|", e.g. "10s" or "5d"), and the trailing word
# boundary keeps the match from ending in the middle of a longer word.
ORDINALS = re.compile(r"([0-9]+(?:st|nd|rd|th))\b")
# A single currency symbol followed by an amount, including optional
# thousands separators and a decimal part ("$5", "$1,250.75"). "|" is not a
# currency symbol, so it is deliberately absent from the character class.
CURRENCY = re.compile(r"([£$€][0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?)")
# Any run of whitespace.
WHITESPACE_RE = re.compile(r"\s+")
# Everything that is NOT a lowercase letter, space or permitted punctuation.
ALLOWED_CHARACTERS_RE = re.compile(r"[^a-z ,.!?'-]+")

# Spoken suffix for each currency symbol; the leading space separates the
# word from the amount it follows.
MONETARY_REPLACEMENT = {"$": " dollars", "£": " pounds", "€": " euros"}

# Lowercase abbreviation (including its trailing period) -> spoken form.
# NOTE(review): "misess" looks like a deliberate phonetic spelling rather than
# a typo; it is runtime data that affects training text, so confirm before
# changing it.
ABBREVIATION_REPLACEMENT = {
    "mr.": "mister",
    "mrs.": "misess",
    "dr.": "doctor",
    "no.": "number",
    "st.": "saint",
    "co.": "company",
    "jr.": "junior",
    "maj.": "major",
    "gen.": "general",
    "drs.": "doctors",
    "rev.": "reverend",
    "lt.": "lieutenant",
    "hon.": "honorable",
    "sgt.": "sergeant",
    "capt.": "captain",
    "esq.": "esquire",
    "ltd.": "limited",
    "col.": "colonel",
    "ft.": "fort",
}
|
|
|
|
|
|
# Plain, comma-grouped and decimal numbers in a single pattern, so each number
# is matched whole ("1,234.56") and converted exactly once.
_SPOKEN_NUMBER_RE = re.compile(r"[0-9]+(?:,[0-9]+)*(?:\.[0-9]+)?")


def _currency_to_words(match):
    """Rewrite a matched amount such as "$5" as "5 dollars"."""
    amount = match.group(0)
    # Drop whatever symbol prefix precedes the first digit.
    number = amount[re.search(r"[0-9]", amount).start():]
    for symbol, word in MONETARY_REPLACEMENT.items():
        if symbol in amount:
            return number + word
    # No known currency symbol in the match: leave it untouched.
    return amount


def _number_to_words(match):
    """Spell out the matched number or ordinal using the inflect engine."""
    return INFLECT_ENGINE.number_to_words(match.group(0))


def clean_text(text):
    """
    Cleans text. This includes:
    - Replacing monetary terms (i.e. $ -> dollars)
    - Converting ordinals to full words (i.e. 1st -> first)
    - Converting numbers to their full word format (i.e. 100 -> one hundred)
    - Replacing abbreviations (i.e. dr. -> doctor)
    - Removing invalid characters (non-ASCII or invalid punctuation)

    Every conversion is applied at the position of the match (regex
    substitution) rather than with a global ``str.replace``, so a short match
    such as "1" can never corrupt a longer token such as "12" or "21st".

    Parameters
    ----------
    text : str
        Text to clean

    Returns
    -------
    str
        Cleaned text
    """
    # Currency must be handled before transliteration: unidecode rewrites
    # symbols such as the pound and euro signs as ASCII letters, after which
    # they can no longer be recognised as currency.
    text = CURRENCY.sub(_currency_to_words, text)

    text = unidecode(text)
    text = text.strip()
    text = text.lower()

    # Ordinals go first so that the plain-number pass does not consume their
    # digits and leave a dangling "st"/"nd"/"rd"/"th".
    text = ORDINALS.sub(_number_to_words, text)

    # Comma-grouped, decimal and plain numbers in one left-to-right pass.
    text = _SPOKEN_NUMBER_RE.sub(_number_to_words, text)

    # Replace abbreviations that start the text or follow whitespace and are
    # followed by whitespace. Lookarounds leave the surrounding whitespace
    # unconsumed, so adjacent abbreviations are both replaced.
    abbreviation_pattern = (
        r"(?<!\S)(?:"
        + "|".join(re.escape(key) for key in ABBREVIATION_REPLACEMENT)
        + r")(?=\s)"
    )
    text = re.sub(
        abbreviation_pattern,
        lambda match: ABBREVIATION_REPLACEMENT[match.group(0)],
        text,
    )

    # Collapse whitespace
    text = WHITESPACE_RE.sub(" ", text)
    # Remove banned characters
    text = ALLOWED_CHARACTERS_RE.sub("", text)
    return text
|
|
|
|
|
|
if __name__ == "__main__":
    # Script to clean text for training.
    # Reads "filename|text" rows from --file, cleans the text part of each row
    # with clean_text, and writes "filename|cleaned text" rows to --output.
    parser = argparse.ArgumentParser(description="Clean & improve text for training")
    parser.add_argument("-f", "--file", help="Text file path", type=str, required=True)
    parser.add_argument("-o", "--output", help="Output text file path", type=str, required=True)
    args = parser.parse_args()

    cleaned_text = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(args.file, encoding="utf-8") as f:
        for line_number, row in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not row.strip():
                continue
            parts = row.split("|")
            if len(parts) != 2:
                raise ValueError(
                    f"Line {line_number} of {args.file!r} must contain exactly one '|' "
                    "separating filename and text"
                )
            filename, text = parts
            cleaned_text.append(f"{filename}|{clean_text(text)}")

    # The input is fully read before the output is opened, so the same path
    # may be used for both.
    with open(args.output, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in cleaned_text)
|