From 0a643276ca14226487a5230466ac5d30e8a13216 Mon Sep 17 00:00:00 2001 From: suluyana Date: Tue, 7 Jan 2025 21:04:29 +0800 Subject: [PATCH] warning parsing trust_remote_code=True --- modelscope/models/nlp/hf_transformers/backbone.py | 3 +++ modelscope/models/nlp/polylm/text_generation.py | 3 +++ modelscope/msdatasets/data_loader/data_loader.py | 5 +++++ .../msdatasets/data_loader/data_loader_manager.py | 9 +++++++++ modelscope/msdatasets/ms_dataset.py | 5 +++++ modelscope/msdatasets/utils/hf_datasets_util.py | 14 ++++++++++++++ modelscope/pipelines/accelerate/vllm.py | 6 +++++- .../pipelines/multi_modal/ovis_vl_pipeline.py | 7 +++++-- modelscope/pipelines/nlp/llm_pipeline.py | 12 ++++++++++++ .../pipelines/nlp/text_generation_pipeline.py | 12 ++++++++++++ modelscope/preprocessors/templates/loader.py | 3 +++ modelscope/utils/automodel_utils.py | 6 +++++- 12 files changed, 81 insertions(+), 4 deletions(-) diff --git a/modelscope/models/nlp/hf_transformers/backbone.py b/modelscope/models/nlp/hf_transformers/backbone.py index 5b9a3965..10681d8b 100644 --- a/modelscope/models/nlp/hf_transformers/backbone.py +++ b/modelscope/models/nlp/hf_transformers/backbone.py @@ -99,6 +99,9 @@ class TransformersModel(TorchModel, PreTrainedModel): return model # return the model only + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {model_dir}.') config, kwargs = AutoConfig.from_pretrained( model_dir, return_unused_kwargs=True, diff --git a/modelscope/models/nlp/polylm/text_generation.py b/modelscope/models/nlp/polylm/text_generation.py index 1881cf2b..cf53157a 100644 --- a/modelscope/models/nlp/polylm/text_generation.py +++ b/modelscope/models/nlp/polylm/text_generation.py @@ -27,6 +27,9 @@ class PolyLMForTextGeneration(TorchModel, StreamingOutputMixin): super().__init__(model_dir, *args, **kwargs) self.tokenizer = AutoTokenizer.from_pretrained( model_dir, legacy=False, use_fast=False) + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model_dir}.') self.model = AutoModelForCausalLM.from_pretrained( model_dir, device_map='auto', trust_remote_code=True) self.model.eval() diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index 92074449..9e1583f4 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -133,6 +133,11 @@ class OssDownloader(BaseDownloader): raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' if dataset_py_script and dataset_formation == DatasetFormations.hf_compatible: + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {dataset_name}.') + self.dataset = hf_load_dataset( dataset_py_script, name=subset_name, diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py index a9e58b7c..d59fc1d6 100644 --- a/modelscope/msdatasets/data_loader/data_loader_manager.py +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -71,6 +71,11 @@ class LocalDataLoaderManager(DataLoaderManager): # Select local data loader # TODO: more loaders to be supported. if data_loader_type == LocalDataLoaderType.HF_DATA_LOADER: + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {dataset_name}.') + # Build huggingface data loader and return dataset. return hf_data_loader( dataset_name, @@ -110,6 +115,10 @@ class RemoteDataLoaderManager(DataLoaderManager): # To use the huggingface data loader if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {dataset_name}.') dataset_ret = hf_data_loader( dataset_name, name=subset_name, diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index 899142ad..28de17f6 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -237,6 +237,11 @@ class MsDataset: if not namespace or not dataset_name: raise 'The dataset_name should be in the form of `namespace/dataset_name` or `dataset_name`.' + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {dataset_name}.') + # Init context config dataset_context_config = DatasetContextConfig( dataset_name=dataset_name, diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index 1e9e9970..8f933ceb 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -936,6 +936,11 @@ class DatasetsWrapperHF: verification_mode or VerificationMode.BASIC_CHECKS ) if not save_infos else VerificationMode.ALL_CHECKS) + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {path}.') + # Create a dataset builder builder_instance = DatasetsWrapperHF.load_dataset_builder( path=path, @@ -1063,6 +1068,11 @@ class DatasetsWrapperHF: ) if download_config else DownloadConfig() download_config.storage_options.update(storage_options) + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {path}.') + dataset_module = DatasetsWrapperHF.dataset_module_factory( path, revision=revision, @@ -1173,6 +1183,10 @@ class DatasetsWrapperHF: # -> the module from the python file in the dataset repository # - if path has one "/" and is dataset repository on the HF hub without a python file # -> use a packaged module (csv, text etc.) based on content of the repository + if trust_remote_code: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {path}.') # Try packaged if path in _PACKAGED_DATASETS_MODULES: diff --git a/modelscope/pipelines/accelerate/vllm.py b/modelscope/pipelines/accelerate/vllm.py index 15ced4bb..599f520a 100644 --- a/modelscope/pipelines/accelerate/vllm.py +++ b/modelscope/pipelines/accelerate/vllm.py @@ -2,7 +2,8 @@ from typing import List, Union from modelscope.pipelines.accelerate.base import InferFramework from modelscope.utils.import_utils import is_vllm_available - +from modelscope import get_logger +logger = get_logger() class Vllm(InferFramework): @@ -27,6 +28,9 @@ class Vllm(InferFramework): if not Vllm.check_gpu_compatibility(8) and (dtype in ('bfloat16', 'auto')): dtype = 'float16' + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {self.model_dir}.') self.model = LLM( self.model_dir, dtype=dtype, diff --git a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py index f19eddff..5e1e18a9 100644 --- a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py +++ b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Union import torch -from modelscope import AutoModelForCausalLM +from modelscope import AutoModelForCausalLM, get_logger from modelscope.metainfo import Pipelines, Preprocessors from modelscope.models.base import Model from modelscope.outputs import OutputKeys @@ -12,7 +12,7 @@ from modelscope.pipelines.multi_modal.visual_question_answering_pipeline import VisualQuestionAnsweringPipeline from modelscope.preprocessors import Preprocessor, load_image from modelscope.utils.constant import Fields, Frameworks, Tasks - +logger = get_logger() @PIPELINES.register_module( Tasks.visual_question_answering, module_name='ovis-vl') @@ -35,6 +35,9 @@ class 
VisionChatPipeline(VisualQuestionAnsweringPipeline): torch_dtype = kwargs.get('torch_dtype', torch.float16) multimodal_max_length = kwargs.get('multimodal_max_length', 8192) self.device = 'cuda' if device == 'gpu' else device + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model}.') self.model = AutoModelForCausalLM.from_pretrained( model, torch_dtype=torch_dtype, diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index 7ad0d278..ceaa4c90 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -97,6 +97,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): assert base_model is not None, 'Cannot get adapter_cfg.model_id_or_path from configuration.json file.' revision = self.cfg.safe_get('adapter_cfg.model_revision', 'master') + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {base_model}.') base_model = Model.from_pretrained( base_model, revision, @@ -134,6 +137,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): model) else snapshot_download(model) # TODO: Temporary use of AutoModelForCausalLM # Need to be updated into a universal solution + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model_dir}.') model = AutoModelForCausalLM.from_pretrained( model_dir, device_map=self.device_map, @@ -173,6 +179,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): self.llm_framework = llm_framework if os.path.exists(kwargs['model']): + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {kwargs["model"]}.') config = AutoConfig.from_pretrained( kwargs['model'], trust_remote_code=True) q_config = config.__dict__.get('quantization_config', None) @@ -423,6 +432,9 @@ class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): model_dir = self.model.model_dir if tokenizer_class is None: tokenizer_class = AutoTokenizer + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model_dir}.') return tokenizer_class.from_pretrained( model_dir, trust_remote_code=True) diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py b/modelscope/pipelines/nlp/text_generation_pipeline.py index 55eaf809..555e7a9d 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -269,6 +269,9 @@ class ChatGLM6bV2TextGenerationPipeline(Pipeline): if use_bf16: default_torch_dtype = torch.bfloat16 torch_dtype = kwargs.get('torch_dtype', default_torch_dtype) + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model_dir}.') model = Model.from_pretrained( model_dir, trust_remote_code=True, @@ -285,6 +288,9 @@ self.model = model self.model.eval() + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {self.model.model_dir}.') self.tokenizer = AutoTokenizer.from_pretrained( self.model.model_dir, trust_remote_code=True) @@ -328,6 +334,9 @@ class QWenChatPipeline(Pipeline): bf16 = False if isinstance(model, str): + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {model}.') self.tokenizer = AutoTokenizer.from_pretrained( model, revision=revision, trust_remote_code=True) self.model = AutoModelForCausalLM.from_pretrained( @@ -392,6 +401,9 @@ class QWenTextGenerationPipeline(Pipeline): bf16 = False if isinstance(model, str): + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model}.') self.model = AutoModelForCausalLM.from_pretrained( model, device_map=device_map, diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py index 8943f25d..b9da3608 100644 --- a/modelscope/preprocessors/templates/loader.py +++ b/modelscope/preprocessors/templates/loader.py @@ -820,6 +820,9 @@ class TemplateLoader: model_id, revision=kwargs.pop('revision', 'master'), ignore_file_pattern=ignore_file_pattern) + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. 
Please make sure that' + f' the remote code content is what you need {model_dir}.') tokenizer = AutoTokenizer.from_pretrained( model_dir, trust_remote_code=True) config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) diff --git a/modelscope/utils/automodel_utils.py b/modelscope/utils/automodel_utils.py index eb4aa6c8..435d461d 100644 --- a/modelscope/utils/automodel_utils.py +++ b/modelscope/utils/automodel_utils.py @@ -8,7 +8,8 @@ from modelscope.utils.ast_utils import INDEX_KEY from modelscope.utils.import_utils import (LazyImportModule, is_torch_available, is_transformers_available) - +from modelscope import get_logger +logger = get_logger() def can_load_by_ms(model_dir: str, task_name: Optional[str], model_type: Optional[str]) -> bool: @@ -91,6 +92,9 @@ def get_hf_automodel_class(model_dir: str, if not os.path.exists(config_path): return None try: + logger.warning('Use trust_remote_code=True. The code will be downloaded' + ' and used from the remote repo. Please make sure that' + f' the remote code content is what you need {model_dir}.') config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) if task_name is None: automodel_class = get_default_automodel(config)