mirror of
https://github.com/modelscope/modelscope.git
synced 2026-02-24 20:19:51 +01:00
[to #42322933] fix preprocessor concurrency
Use the non-fast version of the tokenizer to avoid `RuntimeError: Already borrowed` errors in concurrent environments.
This commit is contained in:
@@ -139,13 +139,13 @@ class NLPTokenizerPreprocessorBase(Preprocessor):
     def build_tokenizer(self, model_dir):
         model_type = get_model_type(model_dir)
         if model_type in (Models.structbert, Models.gpt3, Models.palm):
-            from modelscope.models.nlp.structbert import SbertTokenizerFast
-            return SbertTokenizerFast.from_pretrained(model_dir)
+            from modelscope.models.nlp.structbert import SbertTokenizer
+            return SbertTokenizer.from_pretrained(model_dir, use_fast=False)
         elif model_type == Models.veco:
-            from modelscope.models.nlp.veco import VecoTokenizerFast
-            return VecoTokenizerFast.from_pretrained(model_dir)
+            from modelscope.models.nlp.veco import VecoTokenizer
+            return VecoTokenizer.from_pretrained(model_dir)
         else:
-            return AutoTokenizer.from_pretrained(model_dir)
+            return AutoTokenizer.from_pretrained(model_dir, use_fast=False)

     def __call__(self, data: Union[str, Tuple, Dict]) -> Dict[str, Any]:
         """process the raw input data
@@ -468,7 +468,7 @@ class NERPreprocessor(Preprocessor):
         self.model_dir: str = model_dir
         self.sequence_length = kwargs.pop('sequence_length', 512)
         self.tokenizer = AutoTokenizer.from_pretrained(
-            model_dir, use_fast=True)
+            model_dir, use_fast=False)
         self.is_split_into_words = self.tokenizer.init_kwargs.get(
             'is_split_into_words', False)

Reference in New Issue
Block a user