From 2ec18b89da26acd7772a4ccf00211b6097f6a252 Mon Sep 17 00:00:00 2001 From: suluyan Date: Fri, 20 Dec 2024 16:46:24 +0800 Subject: [PATCH] fix punkt --- .dev_scripts/ci_container_test.sh | 1 + modelscope/pipelines/nlp/llm_pipeline.py | 3 ++- .../preprocessors/nlp/fill_mask_preprocessor.py | 11 +++++++---- modelscope/utils/streaming_output.py | 6 ++++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.dev_scripts/ci_container_test.sh b/.dev_scripts/ci_container_test.sh index 1782693e..31556192 100644 --- a/.dev_scripts/ci_container_test.sh +++ b/.dev_scripts/ci_container_test.sh @@ -31,6 +31,7 @@ if [ "$MODELSCOPE_SDK_DEBUG" == "True" ]; then python -m spacy download en_core_web_sm pip install faiss-gpu pip install healpy + pip install huggingface-hub==0.25.2 # test with install pip install . else diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index 2c08c498..3199d7fa 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -269,7 +269,8 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): assert model_id.lower() in SWIFT_MODEL_ID_MAPPING,\ f'Invalid model id {model_id} or Swift framework does not support this model.' 
- args = InferArguments(model_type=SWIFT_MODEL_ID_MAPPING[model_id.lower()]) + args = InferArguments( + model_type=SWIFT_MODEL_ID_MAPPING[model_id.lower()]) model, template = prepare_model_template( args, device_map=self.device_map) self.model = add_stream_generate(model) diff --git a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py index f43e03ed..c5113f35 100644 --- a/modelscope/preprocessors/nlp/fill_mask_preprocessor.py +++ b/modelscope/preprocessors/nlp/fill_mask_preprocessor.py @@ -213,11 +213,14 @@ class FillMaskPoNetPreprocessor(FillMaskPreprocessorBase): osp.join(model_dir, ModelFile.CONFIGURATION)) self.language = self.cfg.model.get('language', 'en') if self.language == 'en': - from nltk.tokenize import sent_tokenize import nltk - nltk.download('punkt_tab') - # import_external_nltk_data( - # osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt_tab') + from nltk.tokenize import sent_tokenize + from packaging import version + if version.parse(nltk.__version__) >= version.parse('3.8.2'): + nltk.download('punkt_tab') + else: + import_external_nltk_data( + osp.join(model_dir, 'nltk_data'), 'tokenizers/punkt') elif self.language in ['zh', 'cn']: def sent_tokenize(para): diff --git a/modelscope/utils/streaming_output.py b/modelscope/utils/streaming_output.py index 8de808bd..1b93432a 100644 --- a/modelscope/utils/streaming_output.py +++ b/modelscope/utils/streaming_output.py @@ -178,7 +178,8 @@ class PretrainedModelStreamingOutputMixin(StreamingOutputMixin): if version.parse(transformers.__version__) >= version.parse('4.43.0'): greedy_search_name = 'stream_greedy_search' sample_name = '_sample' - elif version.parse(transformers.__version__) >= version.parse('4.39.0'): + elif version.parse( + transformers.__version__) >= version.parse('4.39.0'): greedy_search_name = '_greedy_search' sample_name = '_sample' else: @@ -452,7 +453,8 @@ 
break # prepare model inputs - model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + model_kwargs = self._get_initial_cache_position( + input_ids, model_kwargs) model_inputs = self.prepare_inputs_for_generation( input_ids, **model_kwargs)