From 9c88fb22a480398c171a03803e48c2794dcd28e2 Mon Sep 17 00:00:00 2001
From: "hemu.zp" <hemu.zp@alibaba-inc.com>
Date: Mon, 13 Feb 2023 09:50:48 +0000
Subject: [PATCH] [to #42322933]  fix gpt3 unexpected spaces

---
 modelscope/utils/chinese_utils.py | 36 +++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/modelscope/utils/chinese_utils.py b/modelscope/utils/chinese_utils.py
index 793c2050..86cf91a2 100644
--- a/modelscope/utils/chinese_utils.py
+++ b/modelscope/utils/chinese_utils.py
@@ -9,21 +9,12 @@ CHINESE_PUNCTUATION = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠
 ENGLISH_PUNCTUATION = string.punctuation
 
 
-def is_chinese_char(word: str):
-    chinese_punctuations = {
-        '，', '。', '；', '：'
-        '！', '？', '《', '》', '‘', '’', '“', '”', '（', '）', '【', '】'
-    }
-    return len(word) == 1 \
-        and ('\u4e00' <= word <= '\u9fa5' or word in chinese_punctuations)
-
-
 def remove_space_between_chinese_chars(decoded_str: str):
     old_word_list = decoded_str.split(' ')
     new_word_list = []
     start = -1
     for i, word in enumerate(old_word_list):
-        if is_chinese_char(word):
+        if _is_chinese_str(word):
             if start == -1:
                 start = i
         else:
@@ -39,10 +30,33 @@ def remove_space_between_chinese_chars(decoded_str: str):
 # add space for each chinese char
 def rebuild_chinese_str(string: str):
     return ' '.join(''.join([
-        f' {char} ' if is_chinese_char(char) else char for char in string
+        f' {char} '
+        if _is_chinese_char(char) or char in CHINESE_PUNCTUATION else char
+        for char in string
     ]).split())
 
 
+def _is_chinese_str(string: str) -> bool:
+    """True if every char is CJK or punctuation; vacuously True for ''."""
+    return all(_is_chinese_char(cp) or cp in CHINESE_PUNCTUATION
+               or cp in ENGLISH_PUNCTUATION for cp in string)
+
+
+def _is_chinese_char(cp: str) -> bool:
+    """Report whether the single character ``cp`` is a CJK ideograph.
+
+    ``ord`` raises ``TypeError`` when ``cp`` is not exactly one character.
+    """
+    code = ord(cp)
+    # Inclusive (first, last) code point pairs; a hit in any pair is CJK.
+    cjk_ranges = (
+        (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
+        (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
+        (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
+    )
+    return any(first <= code <= last for first, last in cjk_ranges)
+
+
 def normalize_chinese_number(text):
     chinese_number = ['零', '一', '二', '三', '四', '五', '六', '七', '八', '九']
     new_text = ''