Add Chinese model

This commit is contained in:
wl-zhao
2023-12-23 20:38:52 +08:00
parent 5ec54a9e68
commit bc1d992f4f
6 changed files with 516 additions and 23 deletions

View File

@@ -75,6 +75,13 @@ def bits_to_string(bits_array):
return output_string
def split_sentence(text, min_len=10, language_str='[EN]'):
    """Split text into short sentences with a language-specific splitter.

    Args:
        text (str): text to split.
        min_len (int): forwarded unchanged to the selected splitter.
        language_str (str): language tag. ``'EN'`` and the bracketed
            form ``'[EN]'`` select ``split_sentences_latin``; every other
            value selects ``split_sentences_zh``.

    Returns:
        The value returned by the selected splitter.
    """
    # The default tag is '[EN]', so both spellings have to be recognised.
    # Matching 'EN' alone silently routed the default call to the Chinese
    # splitter.
    if language_str in ('EN', '[EN]'):
        return split_sentences_latin(text, min_len=min_len)
    return split_sentences_zh(text, min_len=min_len)
def split_sentences_latin(text, min_len=10):
"""Split Long sentences into list of short ones
@@ -133,4 +140,55 @@ def merge_short_sentences_latin(sens):
sens_out.pop(-1)
except:
pass
return sens_out
def split_sentences_zh(text, min_len=10):
    """Split Chinese text into chunks longer than ``min_len`` characters.

    Punctuation is first normalised to ASCII, the text is cut after every
    punctuation mark, and consecutive pieces are then joined with a single
    space until their combined length exceeds ``min_len``.

    Args:
        text (str): text to split.
        min_len (int): a chunk is closed once the summed length of its
            pieces is greater than this value. The final chunk is emitted
            regardless of its length.

    Returns:
        The chunk list after it has been passed through
        ``merge_short_sentences_zh``.
    """
    # Map sentence-ending punctuation (ideographic full stop, full-width
    # and ASCII '!', '?', ';') to an ASCII period.
    text = re.sub('[。!?;!?;]', '.', text)
    # Map the full-width comma to an ASCII comma. An empty character class
    # here is an invalid pattern and makes re.sub raise re.error.
    text = re.sub('[,]', ',', text)
    # Collapse runs of newlines, tabs and spaces into a single space.
    text = re.sub('[\n\t ]+', ' ', text)
    # Insert a split marker after every punctuation mark so the mark stays
    # attached to the piece that precedes it.
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Cut at the markers and trim surrounding whitespace from each piece.
    sentences = [piece.strip() for piece in text.split('$#!')]
    # Text ending in punctuation leaves an empty trailing piece; drop it.
    if sentences and not sentences[-1]:
        sentences.pop()

    new_sentences = []
    new_sent = []
    count_len = 0
    last_index = len(sentences) - 1
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent)
        # Close the chunk once it is long enough, or when input runs out.
        if count_len > min_len or ind == last_index:
            new_sentences.append(' '.join(new_sent))
            new_sent = []
            count_len = 0
    return merge_short_sentences_zh(new_sentences)
def merge_short_sentences_zh(sens):
    """Avoid very short sentences by merging them with a neighbour.

    A sentence counts as short when it has at most two characters. While
    scanning left to right, if the most recent output entry is short, the
    current sentence is appended to it (joined by one space) instead of
    starting a new entry. After the scan, a short final entry is folded
    into the entry before it.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge the current
        # sentence into it.
        if sens_out and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # A short trailing entry has no following sentence to absorb it, so
    # attach it to its predecessor. The explicit length check covers the
    # empty and single-entry cases that previously relied on swallowing
    # IndexError with a bare except.
    if len(sens_out) >= 2 and len(sens_out[-1]) <= 2:
        tail = sens_out.pop()
        sens_out[-1] = sens_out[-1] + " " + tail
    return sens_out