From 2ec18b89da26acd7772a4ccf00211b6097f6a252 Mon Sep 17 00:00:00 2001 From: suluyan Date: Fri, 20 Dec 2024 16:46:24 +0800 Subject: [PATCH] fix punkt --- .dev_scripts/ci_container_test.sh | 1 + modelscope/pipelines/nlp/llm_pipeline.py | 3 ++- .../preprocessors/nlp/fill_mask_preprocessor.py | 11 +++++++---- modelscope/utils/streaming_output.py | 6 ++++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 1782693e..31556192 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -31,6 +31,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then python -m spacy download en_core_web_sm pip install faiss-gpu pip install healpy + pip install huggingface-hub==0.25.2 # test with install pip install . else diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index 2c08c498..3199d7fa 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -269,7 +269,8 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): assert model_id.lower() in SWIFT_MODEL_ID_MAPPING,\ f'Invalid model id {model_id} or Swift framework does not support this model.' 
- args = InferArguments(model_type=SWIFT_MODEL_ID_MAPPING[model_id.lower()]) + args = InferArguments( + model_type=SWIFT_MODEL_ID_MAPPING[model_id.lower()]) model, template = prepare_model_template( args, device_map=self.device_map) self.model = add_stream_generate(model) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py index f43e03ed..c5113f35 100644 --- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -213,11 +213,14 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): osp.join(model_dir, ModelFile.CONFIGURATION)) self.language = self.cfg.model.get('language', 'en') if self.language == 'en': - from nltk.tokenize import sent_tokenize import nltk - nltk.download('punkt_tab') - # import_external_nltk_data( - # osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt_tab') + from nltk.tokenize import sent_tokenize + from packaging import version + if version.parse(nltk.__version__) >= version.parse('3.8.2'): + nltk.download('punkt_tab') + else: + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') elif self.language in ['zh', 'cn']: def sent_tokenize(para): diff --git a/modelscope/utils/streaming_output.py b/modelscope/utils/streaming_output.py index 8de808bd..1b93432a 100644 --- a/modelscope/utils/streaming_output.py +++ b/modelscope/utils/streaming_output.py @@ -178,7 +178,8 @@ class PretrainedModelStreamingOutputMixin(StreamingOutputMixin): if version.parse(transformers.__version__) >= version.parse('4.43.0'): greedy_search_name = 'stream_greedy_search' sample_name = '_sample' - elif version.parse(transformers.__version__) >= version.parse('4.39.0'): + elif version.parse( + transformers.__version__) >= version.parse('4.39.0'): greedy_search_name = '_greedy_search' sample_name = '_sample' else: @@ -452,7 +453,8 @@ 
break # prepare model inputs - model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + model_kwargs = self._get_initial_cache_position( + input_ids, model_kwargs) model_inputs = self.prepare_inputs_for_generation( input_ids, **model_kwargs)