From 848d4d6c0576dbf766ff2c199571fd4e37131afe Mon Sep 17 00:00:00 2001
From: "xiangpeng.wxp"
Date: Tue, 7 Feb 2023 07:39:11 +0000
Subject: [PATCH] [to #42322933] fix discrepancy between train and inference for es/fr

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11570542

    * [to #42322933] fix discrepancy between train and inference for es/fr
---
 modelscope/pipelines/nlp/translation_pipeline.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py
index 2f90be2a..a12c0011 100644
--- a/modelscope/pipelines/nlp/translation_pipeline.py
+++ b/modelscope/pipelines/nlp/translation_pipeline.py
@@ -101,9 +101,15 @@ class TranslationPipeline(Pipeline):
             input_tok = [' '.join(list(item)) for item in input_tok]
         else:
             input = [self._punct_normalizer.normalize(item) for item in input]
+            aggressive_dash_splits = True
+            if (self._src_lang in ['es', 'fr'] and self._tgt_lang == 'en') or (
+                    self._src_lang == 'en' and self._tgt_lang in ['es', 'fr']):
+                aggressive_dash_splits = False
             input_tok = [
                 self._tok.tokenize(
-                    item, return_str=True, aggressive_dash_splits=True)
+                    item,
+                    return_str=True,
+                    aggressive_dash_splits=aggressive_dash_splits)
                 for item in input
             ]
 