[to #42322933]fix tokenizer for faq

多语言faq,Tokenizer新增类型判别
        Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10530690
This commit is contained in:
wenshen.xws
2022-10-26 16:18:27 +08:00
committed by yingda.chen
parent 3b8fb92c13
commit 2c994ed760

View File

@@ -18,11 +18,19 @@ class FaqQuestionAnsweringPreprocessor(NLPBasePreprocessor):
def __init__(self, model_dir: str, *args, **kwargs):
super(FaqQuestionAnsweringPreprocessor, self).__init__(
model_dir, mode=ModeKeys.INFERENCE, **kwargs)
from transformers import BertTokenizer
self.tokenizer = BertTokenizer.from_pretrained(model_dir)
preprocessor_config = Config.from_file(
os.path.join(model_dir, ModelFile.CONFIGURATION)).get(
ConfigFields.preprocessor, {})
if preprocessor_config.get('tokenizer',
'BertTokenizer') == 'XLMRoberta':
from transformers import XLMRobertaTokenizer
self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir)
else:
self.tokenizer = BertTokenizer.from_pretrained(model_dir)
self.MAX_LEN = preprocessor_config.get('max_seq_length', 50)
self.label_dict = None